Raw data
- Study summary: To determine the influence of differential Kdm6a expression in immune cells, whole transcriptome analysis for CD4+ T cells from WT and Kdm6a cKO mice were performed using RNA-Seq.
Loading packages
library(data.table)
library(tidyverse)
library(rmarkdown)
library(ggplot2)
library(pheatmap)
library(AnnotationHub)
library(tximport)
library(Rsubread)
library(DESeq2)
library(UpSetR)
library(ensembldb)
library(gridExtra)
Setting AnnotationHub
Assign your species of interest
DB <- "EnsDb" # Set your DB of interest
AnnotationSpecies <- "mus musculus" # Set your species
ah <- AnnotationHub(hub=getAnnotationHubOption("URL")) # Bring annotation DB
Running AnnotationHub
ahQuery <- query(ah,
pattern=c(DB, AnnotationSpecies),
ignore.case=T) # Filter annotation of interest
# Select the most recent data
DBName <- mcols(ahQuery) %>%
rownames() %>%
tail(1)
# Save the dataset of interest
AnnoDb <- ah[[DBName]]
# Explore your EnsDb object with following accessors:
# columns(AnnpDb)
# keytypes(AnnoDb)
# keys(AnnoDb, keytype=..)
# select(AnnoDb, keys=.., columns=.., keytype=...)
AnnoKey <- keys(AnnoDb, keytype="TXID")
# Note: Annotation has to be done with not genome but transcripts
AnnoDb <- select(AnnoDb,
AnnoKey,
keytype="TXID",
columns=c("GENEID", "GENENAME"))
# Check if your AnnoDb has been extracted and saved correctely
class(AnnoDb)
## [1] "data.frame"
head(AnnoDb) # The column 1 has to assign transcript (e.g. ENSEMBLTRANS)
## TXID GENEID GENENAME
## 1 ENSMUST00000000001 ENSMUSG00000000001 Gnai3
## 2 ENSMUST00000000003 ENSMUSG00000000003 Pbsn
## 3 ENSMUST00000000010 ENSMUSG00000020875 Hoxb9
## 4 ENSMUST00000000028 ENSMUSG00000000028 Cdc45
## 5 ENSMUST00000000033 ENSMUSG00000048583 Igf2
## 6 ENSMUST00000000049 ENSMUSG00000000049 Apoh
featureCounts parameter setting
# "mm10", "mm9", "hg38", or "hg19"
# cf. mm10 = GRCm38
annot.inbuilt <- "mm10"
# GTF file path
annot.ext <- "../../mouse_reference/gencode.m25.primary_assembly.annotation.gtf"
# annotation type:
# e.g.: "gene_id", "transcript_id", or "gene_name"
GTF.attrType <- "gene_id"
# Number of cores
nthread <- 16
# Set a function importing counts from BAM files with featureCounts()
fcounts.fn <- function(vec) {
fc <- featureCounts(files=vec, # a vector assigning BAM file paths
annot.inbuilt=annot.inbuilt,
annot.ext=annot.ext,
GTF.attrType=GTF.attrType,
isGTFAnnotationFile=T,
nthread=nthread,
isPairedEnd=F, # Set this parameter correctly
verbose=T)
return(fc$counts)
}
Importing counts
Importing Salmon counts
Note:
txi1 <- tximport(…, txOut=F)
txi2 <- tximport(…, txOut=T)
txi2 <- summarizedToGene(…)
counts extracted from txi1 and txi2 are the same
# Import gene level summarized counts
salmon.txi <- tximport(metadata$Salmon_path,
type = "salmon",
tx2gene=AnnoDb,
ignoreTxVersion=T,
txOut=F) # TRUE for transcript level, FALSE for gene level
# Extract the counts and save as a data frame
salmon.counts <- salmon.txi$counts
# Explore the salmon count data frame
head(salmon.counts)
## [,1] [,2] [,3] [,4] [,5] [,6]
## ENSMUSG00000000001 6492.000 5822.000 5614.000 6040.00 6142.00 5437.000
## ENSMUSG00000000003 0.000 0.000 0.000 0.00 0.00 0.000
## ENSMUSG00000000028 3904.066 3567.989 3714.192 3410.46 3463.04 3142.454
## ENSMUSG00000000031 33.000 45.000 58.000 12.00 28.00 70.000
## ENSMUSG00000000037 119.001 90.001 82.000 165.00 129.00 100.000
## ENSMUSG00000000049 0.000 0.000 0.000 1.00 2.00 0.000
dim(salmon.counts)
## [1] 51956 6
summary(salmon.counts)
## V1 V2 V3 V4
## Min. : 0 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0
## Median : 0 Median : 0.0 Median : 0.0 Median : 0.0
## Mean : 721 Mean : 642.5 Mean : 627.2 Mean : 658.2
## 3rd Qu.: 22 3rd Qu.: 20.0 3rd Qu.: 20.0 3rd Qu.: 21.2
## Max. :357534 Max. :322612.0 Max. :318293.5 Max. :343971.6
## V5 V6
## Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0.0
## Median : 0.0 Median : 0.0
## Mean : 662.9 Mean : 568.5
## 3rd Qu.: 20.1 3rd Qu.: 18.0
## Max. :337557.2 Max. :305036.9
Importing STAR counts
# Extract counts by running featureCounts()
star.counts <- fcounts.fn(metadata$STAR_path)
##
## ========== _____ _ _ ____ _____ ______ _____
## ===== / ____| | | | _ \| __ \| ____| /\ | __ \
## ===== | (___ | | | | |_) | |__) | |__ / \ | | | |
## ==== \___ \| | | | _ <| _ /| __| / /\ \ | | | |
## ==== ____) | |__| | |_) | | \ \| |____ / ____ \| |__| |
## ========== |_____/ \____/|____/|_| \_\______/_/ \_\_____/
## Rsubread 2.4.0
##
## //========================== featureCounts setting ===========================\\
## || ||
## || Input files : 6 BAM files ||
## || ||
## || WT-rep1Aligned.sortedByCoord.out.bam ||
## || WT-rep2Aligned.sortedByCoord.out.bam ||
## || WT-rep3Aligned.sortedByCoord.out.bam ||
## || cKO-rep1Aligned.sortedByCoord.out.bam ||
## || cKO-rep2Aligned.sortedByCoord.out.bam ||
## || cKO-rep3Aligned.sortedByCoord.out.bam ||
## || ||
## || Paired-end : no ||
## || Count read pairs : no ||
## || Annotation : gencode.m25.primary_assembly.annotation.gtf ... ||
## || Dir for temp files : . ||
## || Threads : 16 ||
## || Level : meta-feature level ||
## || Multimapping reads : counted ||
## || Multi-overlapping reads : not counted ||
## || Min overlapping bases : 1 ||
## || ||
## \\============================================================================//
##
## //================================= Running ==================================\\
## || ||
## || Load annotation file gencode.m25.primary_assembly.annotation.gtf ... ||
## || Features : 843712 ||
## || Meta-features : 55487 ||
## || Chromosomes/contigs : 45 ||
## || ||
## || Process BAM file WT-rep1Aligned.sortedByCoord.out.bam... ||
## || ||
## || Chromosomes/contigs in input file but not in annotation ||
## || GL456359.1 ||
## || GL456393.1 ||
## || GL456389.1 ||
## || JH584300.1 ||
## || GL456368.1 ||
## || GL456394.1 ||
## || GL456360.1 ||
## || JH584301.1 ||
## || GL456390.1 ||
## || GL456213.1 ||
## || GL456382.1 ||
## || GL456378.1 ||
## || JH584302.1 ||
## || GL456387.1 ||
## || GL456370.1 ||
## || GL456366.1 ||
## || GL456383.1 ||
## || GL456379.1 ||
## || GL456396.1 ||
## || GL456392.1 ||
## || GL456367.1 ||
## || ||
## || Single-end reads are included. ||
## || Total alignments : 70746461 ||
## || Successfully assigned alignments : 55979059 (79.1%) ||
## || Running time : 0.07 minutes ||
## || ||
## || Process BAM file WT-rep2Aligned.sortedByCoord.out.bam... ||
## || ||
## || Chromosomes/contigs in input file but not in annotation ||
## || GL456359.1 ||
## || GL456393.1 ||
## || GL456389.1 ||
## || JH584300.1 ||
## || GL456368.1 ||
## || GL456394.1 ||
## || GL456360.1 ||
## || JH584301.1 ||
## || GL456390.1 ||
## || GL456213.1 ||
## || GL456382.1 ||
## || GL456378.1 ||
## || JH584302.1 ||
## || GL456387.1 ||
## || GL456370.1 ||
## || GL456366.1 ||
## || GL456383.1 ||
## || GL456379.1 ||
## || GL456396.1 ||
## || GL456392.1 ||
## || GL456367.1 ||
## || ||
## || Single-end reads are included. ||
## || Total alignments : 63394034 ||
## || Successfully assigned alignments : 50159175 (79.1%) ||
## || Running time : 0.06 minutes ||
## || ||
## || Process BAM file WT-rep3Aligned.sortedByCoord.out.bam... ||
## || ||
## || Chromosomes/contigs in input file but not in annotation ||
## || GL456359.1 ||
## || GL456393.1 ||
## || GL456389.1 ||
## || JH584300.1 ||
## || GL456368.1 ||
## || GL456394.1 ||
## || GL456360.1 ||
## || JH584301.1 ||
## || GL456390.1 ||
## || GL456213.1 ||
## || GL456382.1 ||
## || GL456378.1 ||
## || JH584302.1 ||
## || GL456387.1 ||
## || GL456370.1 ||
## || GL456366.1 ||
## || GL456383.1 ||
## || GL456379.1 ||
## || GL456396.1 ||
## || GL456392.1 ||
## || GL456367.1 ||
## || ||
## || Single-end reads are included. ||
## || Total alignments : 61430033 ||
## || Successfully assigned alignments : 48522991 (79.0%) ||
## || Running time : 0.06 minutes ||
## || ||
## || Process BAM file cKO-rep1Aligned.sortedByCoord.out.bam... ||
## || ||
## || Chromosomes/contigs in input file but not in annotation ||
## || GL456359.1 ||
## || GL456393.1 ||
## || GL456389.1 ||
## || JH584300.1 ||
## || GL456368.1 ||
## || GL456394.1 ||
## || GL456360.1 ||
## || JH584301.1 ||
## || GL456390.1 ||
## || GL456213.1 ||
## || GL456382.1 ||
## || GL456378.1 ||
## || JH584302.1 ||
## || GL456387.1 ||
## || GL456370.1 ||
## || GL456366.1 ||
## || GL456383.1 ||
## || GL456379.1 ||
## || GL456396.1 ||
## || GL456392.1 ||
## || GL456367.1 ||
## || ||
## || Single-end reads are included. ||
## || Total alignments : 64533984 ||
## || Successfully assigned alignments : 51163980 (79.3%) ||
## || Running time : 0.07 minutes ||
## || ||
## || Process BAM file cKO-rep2Aligned.sortedByCoord.out.bam... ||
## || ||
## || Chromosomes/contigs in input file but not in annotation ||
## || GL456359.1 ||
## || GL456393.1 ||
## || GL456389.1 ||
## || JH584300.1 ||
## || GL456368.1 ||
## || GL456394.1 ||
## || GL456360.1 ||
## || JH584301.1 ||
## || GL456390.1 ||
## || GL456213.1 ||
## || GL456382.1 ||
## || GL456378.1 ||
## || JH584302.1 ||
## || GL456387.1 ||
## || GL456370.1 ||
## || GL456366.1 ||
## || GL456383.1 ||
## || GL456379.1 ||
## || GL456396.1 ||
## || GL456392.1 ||
## || GL456367.1 ||
## || ||
## || Single-end reads are included. ||
## || Total alignments : 65134349 ||
## || Successfully assigned alignments : 51790758 (79.5%) ||
## || Running time : 0.07 minutes ||
## || ||
## || Process BAM file cKO-rep3Aligned.sortedByCoord.out.bam... ||
## || ||
## || Chromosomes/contigs in input file but not in annotation ||
## || GL456359.1 ||
## || GL456393.1 ||
## || GL456389.1 ||
## || JH584300.1 ||
## || GL456368.1 ||
## || GL456394.1 ||
## || GL456360.1 ||
## || JH584301.1 ||
## || GL456390.1 ||
## || GL456213.1 ||
## || GL456382.1 ||
## || GL456378.1 ||
## || JH584302.1 ||
## || GL456387.1 ||
## || GL456370.1 ||
## || GL456366.1 ||
## || GL456383.1 ||
## || GL456379.1 ||
## || GL456396.1 ||
## || GL456392.1 ||
## || GL456367.1 ||
## || ||
## || Single-end reads are included. ||
## || Total alignments : 56200709 ||
## || Successfully assigned alignments : 44128747 (78.5%) ||
## || Running time : 0.06 minutes ||
## || ||
## || Write the final count table. ||
## || Write the read assignment summary. ||
## || ||
## \\============================================================================//
# Explore the STAR count data frame
head(star.counts)
## WT-rep1Aligned.sortedByCoord.out.bam
## ENSMUSG00000102693.1 0
## ENSMUSG00000064842.1 0
## ENSMUSG00000051951.5 0
## ENSMUSG00000102851.1 0
## ENSMUSG00000103377.1 1
## ENSMUSG00000104017.1 0
## WT-rep2Aligned.sortedByCoord.out.bam
## ENSMUSG00000102693.1 0
## ENSMUSG00000064842.1 0
## ENSMUSG00000051951.5 0
## ENSMUSG00000102851.1 0
## ENSMUSG00000103377.1 2
## ENSMUSG00000104017.1 0
## WT-rep3Aligned.sortedByCoord.out.bam
## ENSMUSG00000102693.1 0
## ENSMUSG00000064842.1 0
## ENSMUSG00000051951.5 0
## ENSMUSG00000102851.1 0
## ENSMUSG00000103377.1 0
## ENSMUSG00000104017.1 2
## cKO-rep1Aligned.sortedByCoord.out.bam
## ENSMUSG00000102693.1 0
## ENSMUSG00000064842.1 0
## ENSMUSG00000051951.5 0
## ENSMUSG00000102851.1 0
## ENSMUSG00000103377.1 0
## ENSMUSG00000104017.1 1
## cKO-rep2Aligned.sortedByCoord.out.bam
## ENSMUSG00000102693.1 0
## ENSMUSG00000064842.1 0
## ENSMUSG00000051951.5 0
## ENSMUSG00000102851.1 0
## ENSMUSG00000103377.1 0
## ENSMUSG00000104017.1 0
## cKO-rep3Aligned.sortedByCoord.out.bam
## ENSMUSG00000102693.1 1
## ENSMUSG00000064842.1 0
## ENSMUSG00000051951.5 0
## ENSMUSG00000102851.1 1
## ENSMUSG00000103377.1 0
## ENSMUSG00000104017.1 0
dim(star.counts)
## [1] 55487 6
summary(star.counts)
## WT-rep1Aligned.sortedByCoord.out.bam WT-rep2Aligned.sortedByCoord.out.bam
## Min. : 0 Min. : 0
## 1st Qu.: 0 1st Qu.: 0
## Median : 1 Median : 1
## Mean : 1009 Mean : 904
## 3rd Qu.: 86 3rd Qu.: 78
## Max. :811870 Max. :470203
## WT-rep3Aligned.sortedByCoord.out.bam cKO-rep1Aligned.sortedByCoord.out.bam
## Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0.0
## Median : 1.0 Median : 1.0
## Mean : 874.5 Mean : 922.1
## 3rd Qu.: 77.0 3rd Qu.: 82.0
## Max. :388953.0 Max. :459595.0
## cKO-rep2Aligned.sortedByCoord.out.bam cKO-rep3Aligned.sortedByCoord.out.bam
## Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0.0
## Median : 1.0 Median : 1.0
## Mean : 933.4 Mean : 795.3
## 3rd Qu.: 81.0 3rd Qu.: 70.0
## Max. :412338.0 Max. :390466.0
Importing HISAT2 counts
# Extract counts by running featureCounts()
hisat2.counts <- fcounts.fn(metadata$HISAT2_path)
##
## ========== _____ _ _ ____ _____ ______ _____
## ===== / ____| | | | _ \| __ \| ____| /\ | __ \
## ===== | (___ | | | | |_) | |__) | |__ / \ | | | |
## ==== \___ \| | | | _ <| _ /| __| / /\ \ | | | |
## ==== ____) | |__| | |_) | | \ \| |____ / ____ \| |__| |
## ========== |_____/ \____/|____/|_| \_\______/_/ \_\_____/
## Rsubread 2.4.0
##
## //========================== featureCounts setting ===========================\\
## || ||
## || Input files : 6 BAM files ||
## || ||
## || WT-rep1.sorted.bam ||
## || WT-rep2.sorted.bam ||
## || WT-rep3.sorted.bam ||
## || cKO-rep1.sorted.bam ||
## || cKO-rep2.sorted.bam ||
## || cKO-rep3.sorted.bam ||
## || ||
## || Paired-end : no ||
## || Count read pairs : no ||
## || Annotation : gencode.m25.primary_assembly.annotation.gtf ... ||
## || Dir for temp files : . ||
## || Threads : 16 ||
## || Level : meta-feature level ||
## || Multimapping reads : counted ||
## || Multi-overlapping reads : not counted ||
## || Min overlapping bases : 1 ||
## || ||
## \\============================================================================//
##
## //================================= Running ==================================\\
## || ||
## || Load annotation file gencode.m25.primary_assembly.annotation.gtf ... ||
## || Features : 843712 ||
## || Meta-features : 55487 ||
## || Chromosomes/contigs : 45 ||
## || ||
## || Process BAM file WT-rep1.sorted.bam... ||
## || ||
## || Chromosomes/contigs in input file but not in annotation ||
## || GL456359.1 ||
## || GL456393.1 ||
## || GL456389.1 ||
## || JH584300.1 ||
## || GL456368.1 ||
## || GL456394.1 ||
## || GL456360.1 ||
## || JH584301.1 ||
## || GL456390.1 ||
## || GL456213.1 ||
## || GL456382.1 ||
## || GL456378.1 ||
## || JH584302.1 ||
## || GL456387.1 ||
## || GL456370.1 ||
## || GL456366.1 ||
## || GL456383.1 ||
## || GL456379.1 ||
## || GL456396.1 ||
## || GL456392.1 ||
## || GL456367.1 ||
## || ||
## || Single-end reads are included. ||
## || Total alignments : 63363619 ||
## || Successfully assigned alignments : 48324485 (76.3%) ||
## || Running time : 0.06 minutes ||
## || ||
## || Process BAM file WT-rep2.sorted.bam... ||
## || ||
## || Chromosomes/contigs in input file but not in annotation ||
## || GL456359.1 ||
## || GL456393.1 ||
## || GL456389.1 ||
## || JH584300.1 ||
## || GL456368.1 ||
## || GL456394.1 ||
## || GL456360.1 ||
## || JH584301.1 ||
## || GL456390.1 ||
## || GL456213.1 ||
## || GL456382.1 ||
## || GL456378.1 ||
## || JH584302.1 ||
## || GL456387.1 ||
## || GL456370.1 ||
## || GL456366.1 ||
## || GL456383.1 ||
## || GL456379.1 ||
## || GL456396.1 ||
## || GL456392.1 ||
## || GL456367.1 ||
## || ||
## || Single-end reads are included. ||
## || Total alignments : 56580014 ||
## || Successfully assigned alignments : 43217333 (76.4%) ||
## || Running time : 0.06 minutes ||
## || ||
## || Process BAM file WT-rep3.sorted.bam... ||
## || ||
## || Chromosomes/contigs in input file but not in annotation ||
## || GL456359.1 ||
## || GL456393.1 ||
## || GL456389.1 ||
## || JH584300.1 ||
## || GL456368.1 ||
## || GL456394.1 ||
## || GL456360.1 ||
## || JH584301.1 ||
## || GL456390.1 ||
## || GL456213.1 ||
## || GL456382.1 ||
## || GL456378.1 ||
## || JH584302.1 ||
## || GL456387.1 ||
## || GL456370.1 ||
## || GL456366.1 ||
## || GL456383.1 ||
## || GL456379.1 ||
## || GL456396.1 ||
## || GL456392.1 ||
## || GL456367.1 ||
## || ||
## || Single-end reads are included. ||
## || Total alignments : 54969075 ||
## || Successfully assigned alignments : 41849987 (76.1%) ||
## || Running time : 0.06 minutes ||
## || ||
## || Process BAM file cKO-rep1.sorted.bam... ||
## || ||
## || Chromosomes/contigs in input file but not in annotation ||
## || GL456359.1 ||
## || GL456393.1 ||
## || GL456389.1 ||
## || JH584300.1 ||
## || GL456368.1 ||
## || GL456394.1 ||
## || GL456360.1 ||
## || JH584301.1 ||
## || GL456390.1 ||
## || GL456213.1 ||
## || GL456382.1 ||
## || GL456378.1 ||
## || JH584302.1 ||
## || GL456387.1 ||
## || GL456370.1 ||
## || GL456366.1 ||
## || GL456383.1 ||
## || GL456379.1 ||
## || GL456396.1 ||
## || GL456392.1 ||
## || GL456367.1 ||
## || ||
## || Single-end reads are included. ||
## || Total alignments : 57670244 ||
## || Successfully assigned alignments : 44111169 (76.5%) ||
## || Running time : 0.06 minutes ||
## || ||
## || Process BAM file cKO-rep2.sorted.bam... ||
## || ||
## || Chromosomes/contigs in input file but not in annotation ||
## || GL456359.1 ||
## || GL456393.1 ||
## || GL456389.1 ||
## || JH584300.1 ||
## || GL456368.1 ||
## || GL456394.1 ||
## || GL456360.1 ||
## || JH584301.1 ||
## || GL456390.1 ||
## || GL456213.1 ||
## || GL456382.1 ||
## || GL456378.1 ||
## || JH584302.1 ||
## || GL456387.1 ||
## || GL456370.1 ||
## || GL456366.1 ||
## || GL456383.1 ||
## || GL456379.1 ||
## || GL456396.1 ||
## || GL456392.1 ||
## || GL456367.1 ||
## || ||
## || Single-end reads are included. ||
## || Total alignments : 58054098 ||
## || Successfully assigned alignments : 44514481 (76.7%) ||
## || Running time : 0.06 minutes ||
## || ||
## || Process BAM file cKO-rep3.sorted.bam... ||
## || ||
## || Chromosomes/contigs in input file but not in annotation ||
## || GL456359.1 ||
## || GL456393.1 ||
## || GL456389.1 ||
## || JH584300.1 ||
## || GL456368.1 ||
## || GL456394.1 ||
## || GL456360.1 ||
## || JH584301.1 ||
## || GL456390.1 ||
## || GL456213.1 ||
## || GL456382.1 ||
## || GL456378.1 ||
## || JH584302.1 ||
## || GL456387.1 ||
## || GL456370.1 ||
## || GL456366.1 ||
## || GL456383.1 ||
## || GL456379.1 ||
## || GL456396.1 ||
## || GL456392.1 ||
## || GL456367.1 ||
## || ||
## || Single-end reads are included. ||
## || Total alignments : 50205194 ||
## || Successfully assigned alignments : 38062143 (75.8%) ||
## || Running time : 0.05 minutes ||
## || ||
## || Write the final count table. ||
## || Write the read assignment summary. ||
## || ||
## \\============================================================================//
# Explore the HISAT2 count data frame
head(hisat2.counts)
## WT-rep1.sorted.bam WT-rep2.sorted.bam WT-rep3.sorted.bam
## ENSMUSG00000102693.1 0 0 0
## ENSMUSG00000064842.1 0 0 0
## ENSMUSG00000051951.5 0 0 0
## ENSMUSG00000102851.1 0 0 0
## ENSMUSG00000103377.1 1 2 0
## ENSMUSG00000104017.1 0 0 2
## cKO-rep1.sorted.bam cKO-rep2.sorted.bam
## ENSMUSG00000102693.1 0 0
## ENSMUSG00000064842.1 0 0
## ENSMUSG00000051951.5 0 0
## ENSMUSG00000102851.1 0 0
## ENSMUSG00000103377.1 0 0
## ENSMUSG00000104017.1 1 0
## cKO-rep3.sorted.bam
## ENSMUSG00000102693.1 1
## ENSMUSG00000064842.1 0
## ENSMUSG00000051951.5 0
## ENSMUSG00000102851.1 1
## ENSMUSG00000103377.1 0
## ENSMUSG00000104017.1 0
dim(hisat2.counts)
## [1] 55487 6
summary(hisat2.counts)
## WT-rep1.sorted.bam WT-rep2.sorted.bam WT-rep3.sorted.bam cKO-rep1.sorted.bam
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0
## Median : 1.0 Median : 1.0 Median : 1.0 Median : 1.0
## Mean : 870.9 Mean : 778.9 Mean : 754.2 Mean : 795.0
## 3rd Qu.: 85.0 3rd Qu.: 77.0 3rd Qu.: 76.0 3rd Qu.: 81.5
## Max. :755391.0 Max. :436946.0 Max. :361243.0 Max. :428302.0
## cKO-rep2.sorted.bam cKO-rep3.sorted.bam
## Min. : 0.0 Min. : 0
## 1st Qu.: 0.0 1st Qu.: 0
## Median : 1.0 Median : 1
## Mean : 802.3 Mean : 686
## 3rd Qu.: 81.0 3rd Qu.: 69
## Max. :383887.0 Max. :363272
Data cleaning: sample and gene annotation
countList <- list(salmon.counts,
star.counts,
hisat2.counts)
# Assign names of the count data frames in the count list
names(countList) <- Aligners
# Set a function cleaning the count data frame
clean.fn <- function(df) {
# Convert to a data frame
df <- as.data.frame(df)
# Assign column names
names(df) <- SampleNames
# Bring row names to a column
df <- df %>% rownames_to_column(var="GENEID")
return(df)
}
# Set a function to drop GENEID version
clean.annotation.fn <- function(df) {
# Re-annotate without version specification
df <- separate(df, "GENEID", c("GENEID", "Version"))
# Remove version column
df <- df[, colnames(df) != "Version"]
return(df)
}
# Move GENEID to a column
for (x in Aligners) {
countList[[x]] <- clean.fn(countList[[x]])
}
# Remove version of GENEID and duplicated rows in STAR & HISAT2 count tables
for (x in Aligners) {
countList[[x]] <- clean.annotation.fn(countList[[x]]) %>%
distinct()
}
# Explore the cleaned count data frames
head(countList[[1]])
## GENEID WT-rep1 WT-rep2 WT-rep3 cKO-rep1 cKO-rep2 cKO-rep3
## 1 ENSMUSG00000000001 6492.000 5822.000 5614.000 6040.00 6142.00 5437.000
## 2 ENSMUSG00000000003 0.000 0.000 0.000 0.00 0.00 0.000
## 3 ENSMUSG00000000028 3904.066 3567.989 3714.192 3410.46 3463.04 3142.454
## 4 ENSMUSG00000000031 33.000 45.000 58.000 12.00 28.00 70.000
## 5 ENSMUSG00000000037 119.001 90.001 82.000 165.00 129.00 100.000
## 6 ENSMUSG00000000049 0.000 0.000 0.000 1.00 2.00 0.000
head(countList[[2]])
## GENEID WT-rep1 WT-rep2 WT-rep3 cKO-rep1 cKO-rep2 cKO-rep3
## 1 ENSMUSG00000102693 0 0 0 0 0 1
## 2 ENSMUSG00000064842 0 0 0 0 0 0
## 3 ENSMUSG00000051951 0 0 0 0 0 0
## 4 ENSMUSG00000102851 0 0 0 0 0 1
## 5 ENSMUSG00000103377 1 2 0 0 0 0
## 6 ENSMUSG00000104017 0 0 2 1 0 0
head(countList[[3]])
## GENEID WT-rep1 WT-rep2 WT-rep3 cKO-rep1 cKO-rep2 cKO-rep3
## 1 ENSMUSG00000102693 0 0 0 0 0 1
## 2 ENSMUSG00000064842 0 0 0 0 0 0
## 3 ENSMUSG00000051951 0 0 0 0 0 0
## 4 ENSMUSG00000102851 0 0 0 0 0 1
## 5 ENSMUSG00000103377 1 2 0 0 0 0
## 6 ENSMUSG00000104017 0 0 2 1 0 0
dim(countList[[1]])
## [1] 51956 7
dim(countList[[2]])
## [1] 55487 7
dim(countList[[3]])
## [1] 55487 7
sum(duplicated(countList[[1]]))
## [1] 0
sum(duplicated(countList[[2]]))
## [1] 0
sum(duplicated(countList[[3]]))
## [1] 0
# Convert Salmon counts to integers
countList[["Salmon"]] <- cbind(GENEID=countList[["Salmon"]][, "GENEID"],
round(countList[["Salmon"]][,
colnames(countList[["Salmon"]]) %in% SampleNames]))
# Explore the cleaned count data frames
head(countList[[1]])
## GENEID WT-rep1 WT-rep2 WT-rep3 cKO-rep1 cKO-rep2 cKO-rep3
## 1 ENSMUSG00000000001 6492 5822 5614 6040 6142 5437
## 2 ENSMUSG00000000003 0 0 0 0 0 0
## 3 ENSMUSG00000000028 3904 3568 3714 3410 3463 3142
## 4 ENSMUSG00000000031 33 45 58 12 28 70
## 5 ENSMUSG00000000037 119 90 82 165 129 100
## 6 ENSMUSG00000000049 0 0 0 1 2 0
Plotting sequencing depth
Number of total counts per sample
# Set a function generating a data frame with sequencing depth
seq.depth.fn <- function(df, aligner) {
seqdf <- as.data.frame(colSums(df[, SampleNames])) %>%
rownames_to_column (var="Sample") %>%
mutate(Aligner=aligner)
names(seqdf) <- c("Sample", "Count", "Aligner")
return(seqdf)
}
# Set a function for a bar plot comparing values
comparing.barplot.fn <- function(df, yval, title, ytitle) {
ggplot(df,
aes(x=Sample, y=yval, group=Aligner, fill=Aligner)) +
geom_bar(stat="identity", position="dodge") +
theme_bw() +
ggtitle(title) +
ylab(ytitle)+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
}
# Initialize the seq depth data frame with the first aligner
seq.depth.df <- seq.depth.fn(countList[[1]], Aligners[1])
# Extend the seq depth data frame with the rest of aligners
for (x in Aligners) {
if (x %in% Aligners[2:length(Aligners)]) {
seq.depth.df <- rbind(seq.depth.df,
seq.depth.fn(countList[[x]], x))
}
}
# Explore how the data frame
print(seq.depth.df)
## Sample Count Aligner
## 1 WT-rep1 37460304 Salmon
## 2 WT-rep2 33384135 Salmon
## 3 WT-rep3 32585959 Salmon
## 4 cKO-rep1 34197855 Salmon
## 5 cKO-rep2 34442159 Salmon
## 6 cKO-rep3 29538115 Salmon
## 7 WT-rep1 55979059 STAR
## 8 WT-rep2 50159175 STAR
## 9 WT-rep3 48522991 STAR
## 10 cKO-rep1 51163980 STAR
## 11 cKO-rep2 51790758 STAR
## 12 cKO-rep3 44128747 STAR
## 13 WT-rep1 48324485 HISAT2
## 14 WT-rep2 43217333 HISAT2
## 15 WT-rep3 41849987 HISAT2
## 16 cKO-rep1 44111169 HISAT2
## 17 cKO-rep2 44514481 HISAT2
## 18 cKO-rep3 38062143 HISAT2
summary(seq.depth.df)
## Sample Count Aligner
## Length:18 Min. :29538115 Length:18
## Class :character 1st Qu.:35196695 Class :character
## Mode :character Median :43664251 Mode :character
## Mean :42412935
## 3rd Qu.:48473364
## Max. :55979059
# Convert character vectors to factors
seq.depth.df$Sample <- factor(seq.depth.df$Sample,
levels=SampleNames)
seq.depth.df$Aligner <- factor(seq.depth.df$Aligner,
levels=Aligners)
# Create a plot presenting sequencing depth
comparing.barplot.fn(seq.depth.df,
seq.depth.df$Count,
"Sequencing Depth by Sample and Aligner",
"Count")

Generating DESeq2 objects
- vst() was run to perform variance stabilizing transformation instead of rlog() which takes longer time with similar characteristics.
- The vsd object created by vst() is used for not DE analysis but QC.
Estimating size factors
- black dashed line: size factor = 1
# Calculate and add size factors to the DEseq object
for (x in Aligners) {
ddsList[[x]] <- estimateSizeFactors(ddsList[[x]])
}
# Set a function summarizing size factors by aligner to a data frame
sfactor.fn <- function(df, aligner) {
sizefactor <- as.data.frame(round(sizeFactors(df), 3)) %>%
rownames_to_column(var="Sample") %>%
mutate(Aligner=aligner)
names(sizefactor) <- c("Sample", "Size_Factor", "Aligner")
return(sizefactor)
}
# Initialize a data frame with the first aligner
size.factor.df <- sfactor.fn(ddsList[[1]], Aligners[1])
for (x in Aligners) {
if (x != Aligners[1]) {
size.factor.df <- rbind(size.factor.df,
sfactor.fn(ddsList[[x]], x))
}
}
# Explore the data frame
print(size.factor.df)
## Sample Size_Factor Aligner
## 1 WT-rep1 1.110 Salmon
## 2 WT-rep2 0.998 Salmon
## 3 WT-rep3 0.980 Salmon
## 4 cKO-rep1 1.020 Salmon
## 5 cKO-rep2 1.034 Salmon
## 6 cKO-rep3 0.894 Salmon
## 7 WT-rep1 1.107 STAR
## 8 WT-rep2 1.000 STAR
## 9 WT-rep3 0.977 STAR
## 10 cKO-rep1 1.019 STAR
## 11 cKO-rep2 1.038 STAR
## 12 cKO-rep3 0.891 STAR
## 13 WT-rep1 1.107 HISAT2
## 14 WT-rep2 1.000 HISAT2
## 15 WT-rep3 0.976 HISAT2
## 16 cKO-rep1 1.019 HISAT2
## 17 cKO-rep2 1.038 HISAT2
## 18 cKO-rep3 0.891 HISAT2
# Convert character vectors to factors
size.factor.df$Sample <- factor(size.factor.df$Sample,
levels=SampleNames)
size.factor.df$Aligner <- factor(size.factor.df$Aligner,
levels=Aligners)
# Plot calculated size factors
comparing.barplot.fn(size.factor.df,
size.factor.df$Size_Factor,
"Size Factors by Aligner and Sample",
"Size Factor") + geom_hline(yintercept=1, linetype="dashed", color="black", size=1)

Estimating dispersion and Wald test
- Dispersion is calculated as a measure of variation instead of variance since variance gets larger when gene expression gets higher.
- Wald test is the default setting of DESeq2 which tests null hypothesis between two groups. You should use Likelihood ratio test (LRT) when comparing more than two groups.
Sample QC: Principal Component Analysis (PCA)
Identifies source of variation and sample outliers
# Assigne what to compare
GroupOfInterest <- Contrast[1]
# Set a function for sample pca
qcpca.fn <- function(obj, title) {
plotPCA(obj,
intgroup=GroupOfInterest,
returnData=FALSE) + theme_bw() + ggtitle(paste("PCA:", title))
}
# Print the plots
qcpca.fn(vsdList[[1]], Aligners[1])

qcpca.fn(vsdList[[2]], Aligners[2])

qcpca.fn(vsdList[[3]], Aligners[3])

Sample QC: Sample Correlation Heatmap
Identifies distance between samples & correlation in a group
# Heatmap annotation
HeatmapAnno <- metadata[, c("Sample", "Group")]
# Set a function generating a correlation heatmap
cheatmap.fn <- function(df, title) {
# Extract a normalized count matrix
vm <- assay(df)
# Generate a correlation matrix
cm <- cor(vm)
# Generate a heatmap
pheatmap(cm,
annotation=HeatmapAnno,
main=paste("Sample Correlation Heatmap:", title))
}
# Print the heatmaps
cheatmap.fn(vsdList[[1]], Aligners[1])

cheatmap.fn(vsdList[[2]], Aligners[2])

cheatmap.fn(vsdList[[3]], Aligners[3])

Running DE analysis
# Run DESeq
for (x in Aligners) {
ddsList[[x]] <- DESeq(ddsList[[x]])
# Check result names
ResNames <- resultsNames(ddsList[[x]])
print(ResNames)
}
## [1] "Intercept" "Group_cKO_vs_WT"
## [1] "Intercept" "Group_cKO_vs_WT"
## [1] "Intercept" "Group_cKO_vs_WT"
Creating dispersion plots
- Dispersion is important since estimation by DESeq2 algorithm is based on the assumption that genes with similar expression levels have similar dispersion. If an RNA-seq dataset doesn’t satisfy this assumption, use other DE algorithms than DESeq2.
Exploring distribution of false discovery rate (FDR)
Black dashed line: FDR = 0.1
# Plot distribution of FDR
ggplot(lfc.dataframe,
aes(x=padj, y=..count.., color=Alignment)) +
geom_density(size=1) +
theme_bw() +
scale_x_log10() +
ggtitle("Distribution of False Discovery Rate (FDR) by Aligner") +
ylab("Count") +
xlim(0.00001, 1) +
geom_vline(xintercept=alpha,
color="black",
size=1, linetype="dashed") + scale_x_continuous(breaks=seq(0, 1, by=0.1))

Presenting distribution of log2FoldChange
Black: total genes (padj =/= NA)
Colored: genes above or below FDR=0.1
valid.lfc.df <- subset(lfc.dataframe, FDR == "< 0.1")
ggplot(valid.lfc.df,
aes(x=log2FoldChange,
y=..count..,
color=Alignment)) +
geom_density(size=1) +
theme_bw() +
geom_vline(xintercept=c(-1, 1),
linetype="dashed", color="black", size=1) +
ggtitle("Distribution of log2FoldChange Values by Aligner (FDR < 0.1)") +
ylab("Count") +
xlim(-10, 10) # Change xlim by datatype

Exploring mean-difference with an MA plot
- x-axis: expression level (baseMean))
- y-axis: fold change (log2FoldChange)
- Red dashed lines: log2FoldChange = -1 and 1
# Set ylim: has to adjusted by users depending on data
yl <- c(-10, 10)
# Set min log2 fold change of interest
mLog <- c(-1, 1)
# Create MA plots by Aligner
ggplot(lfc.dataframe, aes(x=baseMean, y=log2FoldChange, color=FDR)) +
geom_point() +
facet_grid(~Alignment) +
scale_x_log10() +
theme_bw() +
scale_color_manual(values=c("blue", "grey")) +
ggtitle(paste("MA plot")) +
ylim(yl[1], yl[2]) +
theme(strip.text.x=element_text(size=10)) +
geom_hline(yintercept=c(mLog[1], mLog[2]),
linetype="dashed", color="red")

Exploring expression profiling with normalized count data
- Normalized count matrices are extracted from dds objects and filtered with thresholds set at FDR and log2FoldChange
- The heatmaps display z-scores of the normalized counts
- In this analysis, mLog = 1
# Initialize a list
heatmap.df.List <- lfcList
# Filter genes with FDR < alpha and absolute log2FoldChange > 1
for (x in Aligners) {
# Set a logical vector filtering FDR below alpha
is.fdr.valid <- lfcList[[x]]$FDR == paste("<", alpha)
# Set a logical vector filtering absolute lfc above 1
is.lfc.large <- abs(lfcList[[x]]$log2FoldChange) > mLog[2]
# Extract total normalized counts
norm.counts <- counts(ddsList[[x]], normalized=T)
# Save filtered genes only from the normalized count data
heatmap.df.List[[x]] <- norm.counts[is.fdr.valid & is.lfc.large,]
}
# Explore the cleaned data frames
head(heatmap.df.List[[1]])
## WT-rep1 WT-rep2 WT-rep3 cKO-rep1 cKO-rep2 cKO-rep3
## ENSMUSG00000000125 17.11371 17.03764 12.24046 19.617050 81.208977 48.112902
## ENSMUSG00000000303 23.41876 21.04649 40.80153 6.865968 4.833868 10.070142
## ENSMUSG00000000673 28.82309 29.06421 31.62119 11.770230 15.468377 11.189047
## ENSMUSG00000000686 85.56853 80.17712 84.66318 39.234100 35.770621 32.448236
## ENSMUSG00000000869 60.34833 64.14170 15.30058 22.559608 8.700962 6.713428
## ENSMUSG00000000957 24.31948 24.05314 15.30058 100.046956 164.351502 111.890470
head(heatmap.df.List[[2]])
## WT-rep1 WT-rep2 WT-rep3 cKO-rep1 cKO-rep2
## ENSMUSG00000025929 151.77009 57.97408 16.37872 7.848327 7.705361
## ENSMUSG00000041872 131.89543 62.97184 16.37872 14.715613 14.447553
## ENSMUSG00000047180 1469.82104 1545.30903 1586.68816 626.885122 470.027043
## ENSMUSG00000026072 49.68664 46.97899 57.32551 10.791450 5.779021
## ENSMUSG00000026070 542.93943 570.74480 901.85308 417.923415 242.718883
## ENSMUSG00000025969 33.42556 36.98346 30.71009 60.824535 78.016784
## cKO-rep3
## ENSMUSG00000025929 5.611470
## ENSMUSG00000041872 8.978352
## ENSMUSG00000047180 622.873185
## ENSMUSG00000026072 14.589822
## ENSMUSG00000026070 253.638450
## ENSMUSG00000025969 65.093054
head(heatmap.df.List[[3]])
## WT-rep1 WT-rep2 WT-rep3 cKO-rep1 cKO-rep2
## ENSMUSG00000025929 150.82592 57.99044 16.38913 7.853379 7.708966
## ENSMUSG00000041872 127.34404 58.99027 12.29185 13.743413 15.417932
## ENSMUSG00000047180 1458.58602 1537.74640 1576.42924 623.361945 468.319699
## ENSMUSG00000026072 49.67321 45.99242 57.36195 10.798396 5.781725
## ENSMUSG00000026070 525.63286 555.90832 872.72106 409.357372 235.123470
## ENSMUSG00000025969 34.31967 37.99373 32.77826 57.918669 82.871387
## cKO-rep3
## ENSMUSG00000025929 5.611066
## ENSMUSG00000041872 8.977705
## ENSMUSG00000047180 617.217229
## ENSMUSG00000026072 14.588771
## ENSMUSG00000026070 246.886892
## ENSMUSG00000025969 70.699428
dim(heatmap.df.List[[1]])
## [1] 237 6
dim(heatmap.df.List[[2]])
## [1] 236 6
dim(heatmap.df.List[[3]])
## [1] 240 6
pheatmap(heatmap.df.List[[3]],
annotation=HeatmapAnno,
scale="row",
show_rownames=F)

# Set a function creating a profiling heatmap
profile.heatmap.fn <- function(df, title) {
pheatmap(df,
annotation=HeatmapAnno,
scale="row",
show_rownames=F,
main=paste("Expression Profiling by", title, "(FDR < 0.1, absolute log2FoldChange > 1)"))
}
# Print the expression heatmaps
profile.heatmap.fn(heatmap.df.List[[Aligners[1]]], Aligners[1])

profile.heatmap.fn(heatmap.df.List[[Aligners[2]]], Aligners[2])

profile.heatmap.fn(heatmap.df.List[[Aligners[3]]], Aligners[3])

NA statistics: zero count genes & outlier genes
When NAs appear in
- log2FoldChange: zero counts in all samples
- padj: too little information
- pval & padj: at least one replicate was an outlier
# Count number of NA genes
type=c("Zero Counts", "Outliers", "Total NA Genes")
# Create a data frame storing number of NA genes by type
NA.genes <- lfc.dataframe %>%
group_by(Alignment) %>%
summarize(zero=sum(is.na(log2FoldChange)),
outlier=sum(is.na(pvalue) & is.na(padj))) %>%
mutate(total=zero + outlier) %>%
gather(Type, Number, -Alignment) %>%
mutate(Type=factor(case_when(Type == "zero" ~ type[1],
Type == "outlier" ~ type[2],
Type == "total" ~ type[3]),
levels=type))
# Plot number of NA genes
ggplot(NA.genes,
aes(x=Type, y=Number, group=Alignment, fill=Alignment, label=Number)) +
geom_bar(stat="identity", position="dodge") +
theme_bw() +
geom_text(position=position_dodge(width=1), vjust=1.5) +
ggtitle("Number of NA Genes") +
ylab("Number of Genes")

baseMean/LFC/FDR comparison between aligners
# Create a data frame storing the number of transcripts by gene id
AnnoDb.ntrans <- AnnoDb %>%
group_by(GENEID) %>%
summarize(num.trans=n_distinct(TXID))
# Create an empty list storing significant gene lfc tables
sigList <- list()
# Filter significant genes' lfc and save in the list
for (x in Aligners) {
sigList[[x]] <- subset(lfcList[[x]], FDR == paste("<", alpha))
}
# Create a vector storing column names
c_names <- colnames(sigList[[1]])[-1]
c_names <- c("GENEID",
paste0(c_names, "_", Aligners[1]),
paste0(c_names, "_", Aligners[2]),
paste0(c_names, "_", Aligners[3]))
# Join tables by GENEID
lfcTable <- sigList[[1]] %>%
inner_join(sigList[[2]], by="GENEID") %>%
inner_join(sigList[[3]], by="GENEID")
# Rename the columns
names(lfcTable) <- c_names
# Calculate differences
lfcTable <- lfcTable %>%
mutate(mean_SA_ST=baseMean_Salmon - baseMean_STAR,
mean_SA_HI=baseMean_Salmon - baseMean_HISAT2,
mean_ST_HI=baseMean_STAR - baseMean_HISAT2,
lfc_SA_ST=log2FoldChange_Salmon - log2FoldChange_STAR,
lfc_SA_HI=log2FoldChange_Salmon - log2FoldChange_HISAT2,
lfc_ST_HI=log2FoldChange_STAR - log2FoldChange_HISAT2,
FDR_SA_ST=padj_Salmon - padj_STAR,
FDR_SA_HI=padj_Salmon - padj_HISAT2,
FDR_ST_HI=padj_STAR - padj_HISAT2) %>%
left_join(AnnoDb.ntrans, by="GENEID")
# Set a function to create a vector storing plot labels
plotlabels.fn <- function(myvec, mylist, num) {
vec <- c()
for (i in 1:num) {
vec <- c(vec, c(myvec[i], rep("", nrow(mylist[[i]]) - 1)))
}
return(vec)
}
# Explore the output table
head(lfcTable)
## GENEID baseMean_Salmon log2FoldChange_Salmon lfcSE_Salmon
## 1 ENSMUSG00000000078 3182.57630 -0.2540106 0.07029216
## 2 ENSMUSG00000000085 574.54499 -0.3051097 0.11172615
## 3 ENSMUSG00000000125 32.55512 -1.6815257 0.54687724
## 4 ENSMUSG00000000142 567.90952 0.4233999 0.15963537
## 5 ENSMUSG00000000184 33671.54570 -0.2175147 0.08092099
## 6 ENSMUSG00000000275 4399.69082 -0.2347587 0.08326712
## stat_Salmon pvalue_Salmon padj_Salmon FDR_Salmon Alignment_Salmon
## 1 -3.613640 0.0003019283 0.009025866 < 0.1 Salmon
## 2 -2.730871 0.0063167266 0.078616294 < 0.1 Salmon
## 3 -3.074777 0.0021065986 0.035676802 < 0.1 Salmon
## 4 2.652294 0.0079946945 0.092354125 < 0.1 Salmon
## 5 -2.687988 0.0071883891 0.086318131 < 0.1 Salmon
## 6 -2.819344 0.0048121884 0.064485900 < 0.1 Salmon
## baseMean_STAR log2FoldChange_STAR lfcSE_STAR stat_STAR pvalue_STAR
## 1 3297.0264 -0.2457384 0.06789377 -3.619455 0.0002952245
## 2 609.0892 -0.2974915 0.10151662 -2.930471 0.0033844842
## 3 34.2459 -1.5923501 0.50259104 -3.168282 0.0015334277
## 4 554.3048 0.4313936 0.15684032 2.750527 0.0059499441
## 5 32580.6962 -0.2126708 0.07890771 -2.695184 0.0070349764
## 6 4550.3626 -0.2311471 0.08246804 -2.802869 0.0050650243
## padj_STAR FDR_STAR Alignment_STAR baseMean_HISAT2 log2FoldChange_HISAT2
## 1 0.008914826 < 0.1 STAR 3245.3292 -0.2448183
## 2 0.054157532 < 0.1 STAR 586.1192 -0.2902909
## 3 0.029873916 < 0.1 STAR 33.3595 -1.5804287
## 4 0.081968251 < 0.1 STAR 546.0067 0.4288078
## 5 0.092687424 < 0.1 STAR 32115.4478 -0.2137057
## 6 0.072494399 < 0.1 STAR 4473.6172 -0.2329771
## lfcSE_HISAT2 stat_HISAT2 pvalue_HISAT2 padj_HISAT2 FDR_HISAT2
## 1 0.06846413 -3.575863 0.0003490748 0.01051970 < 0.1
## 2 0.10679988 -2.718082 0.0065661506 0.09105713 < 0.1
## 3 0.50731877 -3.115258 0.0018378428 0.03526692 < 0.1
## 4 0.15500097 2.766485 0.0056664239 0.08250596 < 0.1
## 5 0.07922648 -2.697402 0.0069882806 0.09507324 < 0.1
## 6 0.08238260 -2.827989 0.0046841398 0.07161359 < 0.1
## Alignment_HISAT2 mean_SA_ST mean_SA_HI mean_ST_HI lfc_SA_ST
## 1 HISAT2 -114.450102 -62.7529172 51.6971844 -0.008272156
## 2 HISAT2 -34.544247 -11.5742133 22.9700335 -0.007618142
## 3 HISAT2 -1.690775 -0.8043754 0.8863992 -0.089175577
## 4 HISAT2 13.604757 21.9027794 8.2980223 -0.007993669
## 5 HISAT2 1090.849492 1556.0979011 465.2484087 -0.004843870
## 6 HISAT2 -150.671758 -73.9264127 76.7453451 -0.003611537
## lfc_SA_HI lfc_ST_HI FDR_SA_ST FDR_SA_HI FDR_ST_HI
## 1 -0.009192246 -0.0009200906 0.0001110396 -0.0014938334 -0.0016048730
## 2 -0.014818795 -0.0072006531 0.0244587618 -0.0124408368 -0.0368995986
## 3 -0.101096974 -0.0119213968 0.0058028863 0.0004098853 -0.0053930010
## 4 -0.005407920 0.0025857491 0.0103858733 0.0098481622 -0.0005377112
## 5 -0.003809007 0.0010348636 -0.0063692929 -0.0087551080 -0.0023858151
## 6 -0.001781560 0.0018299765 -0.0080084993 -0.0071276865 0.0008808128
## num.trans
## 1 3
## 2 14
## 3 1
## 4 7
## 5 8
## 6 3
dim(lfcTable)
## [1] 1199 35
my.param <- c("baseMean", "log2FoldChange", "padj")
# Slice and clean the data frame for input
lfcTable.comp <- lfcTable %>%
dplyr::select(GENEID, num.trans, starts_with(my.param)) %>%
gather(Category, Value, -GENEID, -num.trans) %>%
separate(Category, c("Metric", "Aligner"), sep="_") %>% pivot_wider(names_from=Aligner, values_from=Value) %>%
group_by(Metric) %>%
nest() %>%
# Calculate correlation coefficients between aligners by metric
mutate(salmon.star.corr=map_dbl(data, ~ cor(.x$Salmon, .x$STAR)),
salmon.hisat.corr=map_dbl(data, ~ cor(.x$Salmon, .x$HISAT2)),
star.hisat.corr=map_dbl(data, ~ cor(.x$STAR, .x$HISAT2)))
# Explore the cleaned data frame
lfcTable.comp
## # A tibble: 3 x 5
## # Groups: Metric [3]
## Metric data salmon.star.corr salmon.hisat.co… star.hisat.corr
## <chr> <list> <dbl> <dbl> <dbl>
## 1 baseMean <tibble [1,199… 0.980 0.986 0.998
## 2 log2FoldCha… <tibble [1,199… 0.996 0.996 0.999
## 3 padj <tibble [1,199… 0.855 0.854 0.956
lfcTable.comp$data
## [[1]]
## # A tibble: 1,199 x 5
## GENEID num.trans Salmon STAR HISAT2
## <chr> <int> <dbl> <dbl> <dbl>
## 1 ENSMUSG00000000078 3 3183. 3297. 3245.
## 2 ENSMUSG00000000085 14 575. 609. 586.
## 3 ENSMUSG00000000125 1 32.6 34.2 33.4
## 4 ENSMUSG00000000142 7 568. 554. 546.
## 5 ENSMUSG00000000184 8 33672. 32581. 32115.
## 6 ENSMUSG00000000275 3 4400. 4550. 4474.
## 7 ENSMUSG00000000278 1 2052. 1976. 1942.
## 8 ENSMUSG00000000303 3 17.8 18.5 17.9
## 9 ENSMUSG00000000308 10 177. 187. 180.
## 10 ENSMUSG00000000409 8 16510. 17117. 16661.
## # … with 1,189 more rows
##
## [[2]]
## # A tibble: 1,199 x 5
## GENEID num.trans Salmon STAR HISAT2
## <chr> <int> <dbl> <dbl> <dbl>
## 1 ENSMUSG00000000078 3 -0.254 -0.246 -0.245
## 2 ENSMUSG00000000085 14 -0.305 -0.297 -0.290
## 3 ENSMUSG00000000125 1 -1.68 -1.59 -1.58
## 4 ENSMUSG00000000142 7 0.423 0.431 0.429
## 5 ENSMUSG00000000184 8 -0.218 -0.213 -0.214
## 6 ENSMUSG00000000275 3 -0.235 -0.231 -0.233
## 7 ENSMUSG00000000278 1 -0.383 -0.396 -0.390
## 8 ENSMUSG00000000303 3 1.98 1.97 1.90
## 9 ENSMUSG00000000308 10 -0.903 -0.866 -0.901
## 10 ENSMUSG00000000409 8 0.214 0.219 0.218
## # … with 1,189 more rows
##
## [[3]]
## # A tibble: 1,199 x 5
## GENEID num.trans Salmon STAR HISAT2
## <chr> <int> <dbl> <dbl> <dbl>
## 1 ENSMUSG00000000078 3 0.00903 0.00891 0.0105
## 2 ENSMUSG00000000085 14 0.0786 0.0542 0.0911
## 3 ENSMUSG00000000125 1 0.0357 0.0299 0.0353
## 4 ENSMUSG00000000142 7 0.0924 0.0820 0.0825
## 5 ENSMUSG00000000184 8 0.0863 0.0927 0.0951
## 6 ENSMUSG00000000275 3 0.0645 0.0725 0.0716
## 7 ENSMUSG00000000278 1 0.000491 0.000205 0.000292
## 8 ENSMUSG00000000303 3 0.0173 0.00697 0.0112
## 9 ENSMUSG00000000308 10 0.00127 0.000285 0.000396
## 10 ENSMUSG00000000409 8 0.00361 0.00331 0.00509
## # … with 1,189 more rows
# Prep vectors storing the correlation coefficient without duplication by comparison
Rsquared.salmon.star <- plotlabels.fn(lfcTable.comp$salmon.star.corr,
lfcTable.comp$data,
nrow(lfcTable.comp))
Rsquared.salmon.hisat <- plotlabels.fn(lfcTable.comp$salmon.hisat.corr,
lfcTable.comp$data,
nrow(lfcTable.comp))
Rsquared.star.hisat <- plotlabels.fn(lfcTable.comp$star.hisat.corr,
lfcTable.comp$data,
nrow(lfcTable.comp))
# Unnest the data frame and add columns storing correlation coefficients each
lfcTable.comp <- lfcTable.comp %>%
unnest(data)
head(lfcTable.comp)
## # A tibble: 6 x 9
## # Groups: Metric [1]
## Metric GENEID num.trans Salmon STAR HISAT2 salmon.star.corr salmon.hisat.co…
## <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 baseM… ENSMU… 3 3.18e3 3.30e3 3.25e3 0.980 0.986
## 2 baseM… ENSMU… 14 5.75e2 6.09e2 5.86e2 0.980 0.986
## 3 baseM… ENSMU… 1 3.26e1 3.42e1 3.34e1 0.980 0.986
## 4 baseM… ENSMU… 7 5.68e2 5.54e2 5.46e2 0.980 0.986
## 5 baseM… ENSMU… 8 3.37e4 3.26e4 3.21e4 0.980 0.986
## 6 baseM… ENSMU… 3 4.40e3 4.55e3 4.47e3 0.980 0.986
## # … with 1 more variable: star.hisat.corr <dbl>
lfcTable.comp$Rsquared.salmon.star <- Rsquared.salmon.star
lfcTable.comp$Rsquared.salmon.hisat <- Rsquared.salmon.hisat
lfcTable.comp$Rsquared.star.hisat <- Rsquared.star.hisat
# Explore the data frame
head(lfcTable.comp)
## # A tibble: 6 x 12
## # Groups: Metric [1]
## Metric GENEID num.trans Salmon STAR HISAT2 salmon.star.corr salmon.hisat.co…
## <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 baseM… ENSMU… 3 3.18e3 3.30e3 3.25e3 0.980 0.986
## 2 baseM… ENSMU… 14 5.75e2 6.09e2 5.86e2 0.980 0.986
## 3 baseM… ENSMU… 1 3.26e1 3.42e1 3.34e1 0.980 0.986
## 4 baseM… ENSMU… 7 5.68e2 5.54e2 5.46e2 0.980 0.986
## 5 baseM… ENSMU… 8 3.37e4 3.26e4 3.21e4 0.980 0.986
## 6 baseM… ENSMU… 3 4.40e3 4.55e3 4.47e3 0.980 0.986
## # … with 4 more variables: star.hisat.corr <dbl>, Rsquared.salmon.star <chr>,
## # Rsquared.salmon.hisat <chr>, Rsquared.star.hisat <chr>
# Nest the data frame by Metric
lfcTable.comp <- lfcTable.comp %>%
group_by(Metric) %>%
nest()
# Explore the data frame
lfcTable.comp
## # A tibble: 3 x 2
## # Groups: Metric [3]
## Metric data
## <chr> <list>
## 1 baseMean <tibble [1,199 × 11]>
## 2 log2FoldChange <tibble [1,199 × 11]>
## 3 padj <tibble [1,199 × 11]>
lfcTable.comp$data
## [[1]]
## # A tibble: 1,199 x 11
## GENEID num.trans Salmon STAR HISAT2 salmon.star.corr salmon.hisat.co…
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ENSMUSG000… 3 3183. 3.30e3 3.25e3 0.980 0.986
## 2 ENSMUSG000… 14 575. 6.09e2 5.86e2 0.980 0.986
## 3 ENSMUSG000… 1 32.6 3.42e1 3.34e1 0.980 0.986
## 4 ENSMUSG000… 7 568. 5.54e2 5.46e2 0.980 0.986
## 5 ENSMUSG000… 8 33672. 3.26e4 3.21e4 0.980 0.986
## 6 ENSMUSG000… 3 4400. 4.55e3 4.47e3 0.980 0.986
## 7 ENSMUSG000… 1 2052. 1.98e3 1.94e3 0.980 0.986
## 8 ENSMUSG000… 3 17.8 1.85e1 1.79e1 0.980 0.986
## 9 ENSMUSG000… 10 177. 1.87e2 1.80e2 0.980 0.986
## 10 ENSMUSG000… 8 16510. 1.71e4 1.67e4 0.980 0.986
## # … with 1,189 more rows, and 4 more variables: star.hisat.corr <dbl>,
## # Rsquared.salmon.star <chr>, Rsquared.salmon.hisat <chr>,
## # Rsquared.star.hisat <chr>
##
## [[2]]
## # A tibble: 1,199 x 11
## GENEID num.trans Salmon STAR HISAT2 salmon.star.corr salmon.hisat.co…
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ENSMUSG0000… 3 -0.254 -0.246 -0.245 0.996 0.996
## 2 ENSMUSG0000… 14 -0.305 -0.297 -0.290 0.996 0.996
## 3 ENSMUSG0000… 1 -1.68 -1.59 -1.58 0.996 0.996
## 4 ENSMUSG0000… 7 0.423 0.431 0.429 0.996 0.996
## 5 ENSMUSG0000… 8 -0.218 -0.213 -0.214 0.996 0.996
## 6 ENSMUSG0000… 3 -0.235 -0.231 -0.233 0.996 0.996
## 7 ENSMUSG0000… 1 -0.383 -0.396 -0.390 0.996 0.996
## 8 ENSMUSG0000… 3 1.98 1.97 1.90 0.996 0.996
## 9 ENSMUSG0000… 10 -0.903 -0.866 -0.901 0.996 0.996
## 10 ENSMUSG0000… 8 0.214 0.219 0.218 0.996 0.996
## # … with 1,189 more rows, and 4 more variables: star.hisat.corr <dbl>,
## # Rsquared.salmon.star <chr>, Rsquared.salmon.hisat <chr>,
## # Rsquared.star.hisat <chr>
##
## [[3]]
## # A tibble: 1,199 x 11
## GENEID num.trans Salmon STAR HISAT2 salmon.star.corr salmon.hisat.co…
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ENSMUSG0… 3 9.03e-3 8.91e-3 1.05e-2 0.855 0.854
## 2 ENSMUSG0… 14 7.86e-2 5.42e-2 9.11e-2 0.855 0.854
## 3 ENSMUSG0… 1 3.57e-2 2.99e-2 3.53e-2 0.855 0.854
## 4 ENSMUSG0… 7 9.24e-2 8.20e-2 8.25e-2 0.855 0.854
## 5 ENSMUSG0… 8 8.63e-2 9.27e-2 9.51e-2 0.855 0.854
## 6 ENSMUSG0… 3 6.45e-2 7.25e-2 7.16e-2 0.855 0.854
## 7 ENSMUSG0… 1 4.91e-4 2.05e-4 2.92e-4 0.855 0.854
## 8 ENSMUSG0… 3 1.73e-2 6.97e-3 1.12e-2 0.855 0.854
## 9 ENSMUSG0… 10 1.27e-3 2.85e-4 3.96e-4 0.855 0.854
## 10 ENSMUSG0… 8 3.61e-3 3.31e-3 5.09e-3 0.855 0.854
## # … with 1,189 more rows, and 4 more variables: star.hisat.corr <dbl>,
## # Rsquared.salmon.star <chr>, Rsquared.salmon.hisat <chr>,
## # Rsquared.star.hisat <chr>
# Set a function creating a scatter plot
comp.scatter.fn <- function(df,
xvar,
yvar,
label,
metric,
xlab, ylab,
xlog=F,
ylog=F) {
p <- ggplot(df, aes(x=xvar, y=yvar, color=log(num.trans), label=label)) +
geom_point(alpha=0.5) +
theme_bw() +
geom_text(size=5,
mapping=aes(x=Inf, y=Inf),
vjust=2, hjust=1.2, color="black") +
geom_abline(slope=1, size=0.5, linetype="dashed", color="black") +
ggtitle(paste("Comparison in", metric, "\n(with R-Squared)")) + scale_color_gradient(low="blue", high="red") +
xlab(xlab) +
ylab(ylab)
if (xlog) {
p <- p +
scale_x_log10()
}
if (ylog) {
p <- p + scale_y_log10()
}
return(p)
}
# Create and add the scatter plots with or without logarithmic transformation of
# x/y scales
lfcTable.comp <- lfcTable.comp %>%
# Add scatter plots to the data frame:
#
# without log transformation of x and y scales
mutate(salmon.star=map(data,
~ comp.scatter.fn(.x, .x$Salmon, .x$STAR,
.x$Rsquared.salmon.star, Metric,
"Salmon", "STAR")),
salmon.hisat=map(data, ~ comp.scatter.fn(.x,
.x$Salmon,
.x$HISAT2,
.x$Rsquared.salmon.hisat,
Metric,
"Salmon", "HISAT2")),
star.hisat=map(data, ~ comp.scatter.fn(.x,
.x$STAR,
.x$HISAT2,
.x$Rsquared.star.hisat,
Metric,
"STAR", "HISAT2")),
# with log transformation of x and y scales
salmon.star.log=map(data,
~ comp.scatter.fn(.x, .x$Salmon, .x$STAR,
.x$Rsquared.salmon.star, Metric,
"Salmon", "STAR",
T, T)),
salmon.hisat.log=map(data, ~ comp.scatter.fn(.x,
.x$Salmon,
.x$HISAT2,
.x$Rsquared.salmon.hisat,
Metric,
"Salmon", "HISAT2",
T, T)),
star.hisat.log=map(data, ~ comp.scatter.fn(.x,
.x$STAR,
.x$HISAT2,
.x$Rsquared.star.hisat,
Metric,
"STAR", "HISAT2",
T, T)))
# Set a function simplifying grid.arrange() function
gr.ar.fn0 <- function(pl1, pl2, pl3) {
grid.arrange(pl1, pl2, pl3, ncol=1)
}
# Print the plots
for (i in 1:length(my.param)) {
p1 <- gr.ar.fn0(lfcTable.comp$salmon.star[[i]],
lfcTable.comp$salmon.hisat[[i]],
lfcTable.comp$star.hisat[[i]])
p2 <- gr.ar.fn0(lfcTable.comp$salmon.star.log[[i]],
lfcTable.comp$salmon.hisat.log[[i]],
lfcTable.comp$star.hisat.log[[i]])
print(grid.arrange(p1, p2, nrow=1))
}



## TableGrob (1 x 2) "arrange": 2 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[arrange]
## 2 2 (1-1,2-2) arrange gtable[arrange]



## TableGrob (1 x 2) "arrange": 2 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[arrange]
## 2 2 (1-1,2-2) arrange gtable[arrange]



## TableGrob (1 x 2) "arrange": 2 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[arrange]
## 2 2 (1-1,2-2) arrange gtable[arrange]
Relationship between baseMean/log2FoldChange/padj with the number of alternative transcripts
- Variables of interest: baseMean, log2FoldChange, and padj
- R-squared: tests whether the two variables are correlated
- None of the variable were correlated with the number of alternative transcripts in any of the aligners
- ANOVA: tests whether the the variable is significantly different by Aligner
# Slice the data frame
lfcTable.rel <- lfcTable %>%
dplyr::select(starts_with(c(my.param, "num.")))
# Set a function cleaning my data frame
gather.fn <- function(word) {
gather(lfcTable.rel, Group, Value, starts_with(word)) %>%
dplyr::select(Group, Value, num.trans)
}
# Set a function creating a boxplot
boxplot1.fn <- function(metric, ylog=F) {
df <- subset(lfcTable.rela, Metric == metric)
df$Aligner <- factor(df$Aligner, levels=Aligners)
p <- ggplot(df,
aes(x= Aligner, y=Value, fill=Aligner, label=ANOVA.pval)) +
geom_boxplot(alpha=0.5) +
theme_bw() +
xlab("Aligner") +
ylab("Value") +
theme(strip.text.x=element_text(size=10)) +
ggtitle(paste("Aligner Comparison in",
metric,
"\n(P-value from Two-Tailed ANOVA Test)")) + ylab(metric) +
geom_text(size=5, mapping=aes(x=Inf, y=Inf), vjust=2, hjust=1.2)
if (ylog) {
p <- p + scale_y_log10()
}
return(p)
}
# Clean the data frame
lfcTable.rela <- rbind(gather.fn("base"),
gather.fn("log2"),
gather.fn("padj")) %>%
separate(Group, c("Metric", "Aligner"), sep="_") %>%
# Nest by Metric
group_by(Metric) %>%
nest() %>%
# Add columns storing:
# 1. linear regression model
# 2. summary of the model
# 3. R-squared extracted from the model
# 4. ANOVA test
# 5. p-values extracted from the ANOVA objects
mutate(Model=map(data, ~ lm(Value ~ num.trans, data=.x)),
Model_Summary=map(Model, ~ summary(.x)),
Rsquared=map_dbl(Model_Summary, ~ .x$r.squared),
ANOVA=map(data, ~ unlist(summary(aov(Value ~ Aligner, data=.x)))),
ANOVA.pval=map_dbl(ANOVA, ~ round(.x["Pr(>F)1"], 9)))
# Create a vector storing p-values which will be used in plotting
ANOVA.pval <- plotlabels.fn(lfcTable.rela$ANOVA.pval,
lfcTable.rela$data,
nrow(lfcTable.rela))
# Unnest
lfcTable.rela <- lfcTable.rela %>%
unnest(data)
# Add a column storing previously created p-value vector
lfcTable.rela$ANOVA.pval <- ANOVA.pval
# Plot
boxplot1.fn("baseMean", T)

boxplot1.fn("log2FoldChange", T)

boxplot1.fn("padj", F)

Difference comparison in baseMean/LFC/FDR between Aligners
# Assign names of the list
col.of.int <- c("Subtraction", "Difference")
# Slice and clean the data frame
lfcTable.diff <- dplyr::select(lfcTable,
starts_with(c("mean_",
"lfc_",
"FDR_"))) %>%
dplyr::select(-ends_with(c("Salmon", "STAR", "HISAT2")))
# Create an empty data frame
lfcTable.sub <- data.frame()
# Clean and save in the data frame
for (x in c("mean_", "lfc_", "FDR_")) {
df <- lfcTable.diff %>%
gather(Subtraction, Difference, starts_with(x)) %>%
dplyr::select(col.of.int)
if (x == "mean_") {
lfcTable.sub <- df
} else {
lfcTable.sub <- rbind(lfcTable.sub, df)
}
}
# Explore the output
head(lfcTable.sub)
## Subtraction Difference
## 1 mean_SA_ST -114.450102
## 2 mean_SA_ST -34.544247
## 3 mean_SA_ST -1.690775
## 4 mean_SA_ST 13.604757
## 5 mean_SA_ST 1090.849492
## 6 mean_SA_ST -150.671758
# Rearrange the data frame
lfcTable.sub <- lfcTable.sub %>%
separate(Subtraction, c("Metric", "From", "To"), sep="_") %>%
mutate(Metric=ifelse(Metric == "mean", "baseMean",
ifelse(Metric == "lfc", "log2FoldChange", "padj")),
From=ifelse(From == "SA", "Salmon",
ifelse(From == "ST", "STAR", "HISAT2")),
To=ifelse(To == "SA", "Salmon",
ifelse(To == "ST", "STAR", "HISAT2")),
Subtraction=paste(From, "-", To)) %>%
# Nest by Metric
group_by(Metric) %>%
nest()
# Conduct ANOVA test in the subtraction data
lfcTable.sub <- lfcTable.sub %>%
mutate(ANOVA=map(data, ~aov(Difference ~ Subtraction, data=.x)),
ANOVA.summary=map(ANOVA, ~ unlist(summary(.x))),
ANOVA.pval=map_dbl(ANOVA.summary, ~ .x["Pr(>F)1"]))
# Create a vector storing pvalue labels
sub.ANOVA.pval <- plotlabels.fn(lfcTable.sub$ANOVA.pval,
lfcTable.sub$data,
nrow(lfcTable.sub))
# Unnest the data frame
lfcTable.sub <- lfcTable.sub %>%
unnest(data)
# Add a column storing pvalues
lfcTable.sub$ANOVA.pval <- sub.ANOVA.pval
# Set a function creating a box plot
boxplot2.fn <- function(metric, ylog=F) {
df <- subset(lfcTable.sub, Metric == metric)
p <- ggplot(df,
aes(x= Subtraction, y=abs(Difference),
fill=Subtraction, label=ANOVA.pval)) +
geom_boxplot(alpha=0.5) +
theme_bw() +
xlab("Subtraction") +
ylab("Difference") +
theme(strip.text.x=element_text(size=10)) +
ggtitle(paste("Subtraction Comparison in",
metric,
"\n(P-value from Two-Tailed ANOVA Test)")) +
geom_text(size=5, mapping=aes(x=Inf, y=Inf), vjust=2, hjust=1.2)
if (ylog) {
p <- p + scale_y_log10()
}
return(p)
}
# Print the boxplots comparing subtraction & statistical significance by metric
boxplot2.fn("baseMean", T)

boxplot2.fn("log2FoldChange", T)

boxplot2.fn("padj", F)

Exploration of standard deviation (SD) of baseMean, log2FoldChange, and padj across the alingers
# Create an empty data frame
lfcTable.sd <- data.frame()
for (x in my.param) {
# Slice the data frame with columns of interest
df <- dplyr::select(lfcTable,
starts_with(x))
# Calculate Mean across the aligners
df$Mean <- rowMeans(df)
# Calculate SD across the aligners
df$SD <- apply(df[, 1:3], 1, sd)
# Add a column storing the number of alternative transcripts
df$num.trans <- lfcTable$num.trans
# Change column names
colnames(df) <- str_replace(colnames(df), paste0(x, "_"), "")
# Add a column storing metric
df$Metric <- x
# Save as a data frame
if (x == my.param[1]) {
lfcTable.sd <- df
} else {
lfcTable.sd <- rbind(lfcTable.sd, df)
}
}
# Explore the output data frame
head(lfcTable.sd)
## Salmon STAR HISAT2 Mean SD num.trans Metric
## 1 3182.57630 3297.0264 3245.3292 3241.64397 57.3139792 3 baseMean
## 2 574.54499 609.0892 586.1192 589.91781 17.5826138 14 baseMean
## 3 32.55512 34.2459 33.3595 33.38684 0.8457188 1 baseMean
## 4 567.90952 554.3048 546.0067 556.07367 11.0580162 7 baseMean
## 5 33671.54570 32580.6962 32115.4478 32789.22991 798.7333070 8 baseMean
## 6 4399.69082 4550.3626 4473.6172 4474.55688 75.3402737 3 baseMean
dim(lfcTable.sd)
## [1] 3597 7
# Nest the data frame by Metric
lfcTable.sd <- lfcTable.sd %>%
group_by(Metric) %>%
nest()
# Explore the output data frame
lfcTable.sd
## # A tibble: 3 x 2
## # Groups: Metric [3]
## Metric data
## <chr> <list>
## 1 baseMean <tibble [1,199 × 6]>
## 2 log2FoldChange <tibble [1,199 × 6]>
## 3 padj <tibble [1,199 × 6]>
# Set a function creating a scatter plot demonstrating SD over the number of alternative transcripts
scatter1.fn <- function(df, mlog=F, ylog=F, metric) {
if (mlog) {
p <- ggplot(df, aes(x=num.trans, y=SD, color=log(Mean)))
} else {
p <- ggplot(df, aes(x=num.trans, y=SD, color=Mean))
}
p <- p +
geom_point(alpha=0.5) +
theme_bw() +
xlab("Number of Transcripts") +
ggtitle(paste("Relationship between",
metric,
"Standard Deviation and\nNumber of Transcripts")) +
scale_color_gradient(low="blue", high="red")
if (ylog) {
p <- p +
scale_y_log10() +
ylab("Log (Standard Deviation)")
} else {
p <- p + ylab("Standard Deviation")
}
return(p)
}
# Create and save the scatter plots
lfcTable.sd <- lfcTable.sd %>%
# sd.plot = No log transformation in the plot
# sd.plot.logMean = log(Mean)
# sd.plot.logY = log(SD),
# sd.plot.logBoth = log(Mean) & log(SD)
mutate(sd.plot=map(data, ~ scatter1.fn(.x, F, F, Metric)),
sd.plot.logMean=map(data, ~ scatter1.fn(.x, T, F, Metric)),
sd.plot.logY=map(data, ~ scatter1.fn(.x, F, T, Metric)),
sd.plot.logBoth=map(data, ~ scatter1.fn(.x, T, T, Metric)))
# Set a function shortening scripts
gr.ar.fn2 <- function(pl1, pl2, pl3) {
grid.arrange(pl1, pl2, pl3, nrow=1)
}
# Explore the plots
gr.ar.fn2(lfcTable.sd$sd.plot[[1]],
lfcTable.sd$sd.plot[[2]],
lfcTable.sd$sd.plot[[3]])

gr.ar.fn2(lfcTable.sd$sd.plot.logMean[[1]],
lfcTable.sd$sd.plot.logMean[[2]],
lfcTable.sd$sd.plot.logMean[[3]])

gr.ar.fn2(lfcTable.sd$sd.plot.logY[[1]],
lfcTable.sd$sd.plot.logY[[2]],
lfcTable.sd$sd.plot.logY[[3]])

gr.ar.fn2(lfcTable.sd$sd.plot.logBoth[[1]],
lfcTable.sd$sd.plot.logBoth[[2]],
lfcTable.sd$sd.plot.logBoth[[3]])

# Print the optimal presentation
gr.ar.fn2(lfcTable.sd$sd.plot.logBoth[[1]],
lfcTable.sd$sd.plot.logBoth[[2]],
lfcTable.sd$sd.plot[[3]])

Exploring relationship between difference/percent difference vs the number of alternative transcripts
# Add columns storing percent difference
lfcTable <- lfcTable %>%
# mDiff: difference in meanBase
# lDiff: difference in log2FoldChange
# fDiff: difference in padj (FDR)
mutate(mDiff_percent=100 * mean_SA_ST / (baseMean_Salmon + baseMean_STAR),
lDiff_percent=100 * lfc_SA_ST / (abs(log2FoldChange_Salmon) + abs(log2FoldChange_STAR)),
fDiff_percent=100 * FDR_SA_ST / (padj_Salmon + padj_STAR))
# Explore the output
head(lfcTable)
## GENEID baseMean_Salmon log2FoldChange_Salmon lfcSE_Salmon
## 1 ENSMUSG00000000078 3182.57630 -0.2540106 0.07029216
## 2 ENSMUSG00000000085 574.54499 -0.3051097 0.11172615
## 3 ENSMUSG00000000125 32.55512 -1.6815257 0.54687724
## 4 ENSMUSG00000000142 567.90952 0.4233999 0.15963537
## 5 ENSMUSG00000000184 33671.54570 -0.2175147 0.08092099
## 6 ENSMUSG00000000275 4399.69082 -0.2347587 0.08326712
## stat_Salmon pvalue_Salmon padj_Salmon FDR_Salmon Alignment_Salmon
## 1 -3.613640 0.0003019283 0.009025866 < 0.1 Salmon
## 2 -2.730871 0.0063167266 0.078616294 < 0.1 Salmon
## 3 -3.074777 0.0021065986 0.035676802 < 0.1 Salmon
## 4 2.652294 0.0079946945 0.092354125 < 0.1 Salmon
## 5 -2.687988 0.0071883891 0.086318131 < 0.1 Salmon
## 6 -2.819344 0.0048121884 0.064485900 < 0.1 Salmon
## baseMean_STAR log2FoldChange_STAR lfcSE_STAR stat_STAR pvalue_STAR
## 1 3297.0264 -0.2457384 0.06789377 -3.619455 0.0002952245
## 2 609.0892 -0.2974915 0.10151662 -2.930471 0.0033844842
## 3 34.2459 -1.5923501 0.50259104 -3.168282 0.0015334277
## 4 554.3048 0.4313936 0.15684032 2.750527 0.0059499441
## 5 32580.6962 -0.2126708 0.07890771 -2.695184 0.0070349764
## 6 4550.3626 -0.2311471 0.08246804 -2.802869 0.0050650243
## padj_STAR FDR_STAR Alignment_STAR baseMean_HISAT2 log2FoldChange_HISAT2
## 1 0.008914826 < 0.1 STAR 3245.3292 -0.2448183
## 2 0.054157532 < 0.1 STAR 586.1192 -0.2902909
## 3 0.029873916 < 0.1 STAR 33.3595 -1.5804287
## 4 0.081968251 < 0.1 STAR 546.0067 0.4288078
## 5 0.092687424 < 0.1 STAR 32115.4478 -0.2137057
## 6 0.072494399 < 0.1 STAR 4473.6172 -0.2329771
## lfcSE_HISAT2 stat_HISAT2 pvalue_HISAT2 padj_HISAT2 FDR_HISAT2
## 1 0.06846413 -3.575863 0.0003490748 0.01051970 < 0.1
## 2 0.10679988 -2.718082 0.0065661506 0.09105713 < 0.1
## 3 0.50731877 -3.115258 0.0018378428 0.03526692 < 0.1
## 4 0.15500097 2.766485 0.0056664239 0.08250596 < 0.1
## 5 0.07922648 -2.697402 0.0069882806 0.09507324 < 0.1
## 6 0.08238260 -2.827989 0.0046841398 0.07161359 < 0.1
## Alignment_HISAT2 mean_SA_ST mean_SA_HI mean_ST_HI lfc_SA_ST
## 1 HISAT2 -114.450102 -62.7529172 51.6971844 -0.008272156
## 2 HISAT2 -34.544247 -11.5742133 22.9700335 -0.007618142
## 3 HISAT2 -1.690775 -0.8043754 0.8863992 -0.089175577
## 4 HISAT2 13.604757 21.9027794 8.2980223 -0.007993669
## 5 HISAT2 1090.849492 1556.0979011 465.2484087 -0.004843870
## 6 HISAT2 -150.671758 -73.9264127 76.7453451 -0.003611537
## lfc_SA_HI lfc_ST_HI FDR_SA_ST FDR_SA_HI FDR_ST_HI
## 1 -0.009192246 -0.0009200906 0.0001110396 -0.0014938334 -0.0016048730
## 2 -0.014818795 -0.0072006531 0.0244587618 -0.0124408368 -0.0368995986
## 3 -0.101096974 -0.0119213968 0.0058028863 0.0004098853 -0.0053930010
## 4 -0.005407920 0.0025857491 0.0103858733 0.0098481622 -0.0005377112
## 5 -0.003809007 0.0010348636 -0.0063692929 -0.0087551080 -0.0023858151
## 6 -0.001781560 0.0018299765 -0.0080084993 -0.0071276865 0.0008808128
## num.trans mDiff_percent lDiff_percent fDiff_percent
## 1 3 -1.766314 -1.6552621 0.6189257
## 2 14 -2.918490 -1.2642096 18.4213730
## 3 1 -2.531061 -2.7238534 8.8525137
## 4 7 1.212314 -0.9351579 5.9578544
## 5 8 1.646510 -1.1259958 -3.5581538
## 6 3 -1.683473 -0.7751646 -5.8464607
dim(lfcTable)
## [1] 1199 38
# Create an empty data frame
lfcTable.percent <- data.frame()
for (x in my.param) {
init.vec <- c("GENEID", "num.trans")
# Set columns of interest
if (x == my.param[1]) {
col.of.interest <- c(init.vec, "mean_SA_ST", "mDiff_percent")
} else if (x == my.param[2]) {
col.of.interest <- c(init.vec, "lfc_SA_ST", "lDiff_percent")
} else {
col.of.interest <- c(init.vec, "FDR_SA_ST", "fDiff_percent")
}
# Trim the table
df <- lfcTable[, colnames(lfcTable) %in% col.of.interest] %>%
mutate(Metric=x) %>%
dplyr::rename(Difference=ends_with("ST"),
Percent=ends_with("percent"))
# Save in the empty data frame
if (x == my.param[1]) {
lfcTable.percent <- df
} else {
lfcTable.percent <- rbind(lfcTable.percent, df)
}
}
# Explore the output
head(lfcTable.percent)
## GENEID Difference num.trans Percent Metric
## 1 ENSMUSG00000000078 -114.450102 3 -1.766314 baseMean
## 2 ENSMUSG00000000085 -34.544247 14 -2.918490 baseMean
## 3 ENSMUSG00000000125 -1.690775 1 -2.531061 baseMean
## 4 ENSMUSG00000000142 13.604757 7 1.212314 baseMean
## 5 ENSMUSG00000000184 1090.849492 8 1.646510 baseMean
## 6 ENSMUSG00000000275 -150.671758 3 -1.683473 baseMean
dim(lfcTable.percent)
## [1] 3597 5
# Nest the data frame by Metric
lfcTable.percent <- lfcTable.percent %>%
group_by(Metric) %>%
nest()
# Explore the output
lfcTable.percent
## # A tibble: 3 x 2
## # Groups: Metric [3]
## Metric data
## <chr> <list>
## 1 baseMean <tibble [1,199 × 4]>
## 2 log2FoldChange <tibble [1,199 × 4]>
## 3 padj <tibble [1,199 × 4]>
head(lfcTable.percent$data[[1]])
## # A tibble: 6 x 4
## GENEID Difference num.trans Percent
## <chr> <dbl> <int> <dbl>
## 1 ENSMUSG00000000078 -114. 3 -1.77
## 2 ENSMUSG00000000085 -34.5 14 -2.92
## 3 ENSMUSG00000000125 -1.69 1 -2.53
## 4 ENSMUSG00000000142 13.6 7 1.21
## 5 ENSMUSG00000000184 1091. 8 1.65
## 6 ENSMUSG00000000275 -151. 3 -1.68
# Calculate correlation (Rsquared)
# DvsP: Percent Difference vs Difference
# DvsT: Difference vs Number of Transcripts
# PvsT: Percent Difference vs Number of Transcripts
lfcTable.percent <- lfcTable.percent %>%
mutate(DvsP.mod=map(data, ~ lm(Percent ~ Difference, data=.x)),
DvsP.rsq=map_dbl(DvsP.mod, ~ summary(.x)$r.squared),
DvsT.mod=map(data, ~ lm(Difference ~ num.trans, data=.x)),
DvsT.rsq=map_dbl(DvsT.mod, ~ summary(.x)$r.squared),
PvsT.mod=map(data, ~ lm(Percent ~ num.trans, data=.x)),
PvsT.rsq=map_dbl(PvsT.mod, ~ summary(.x)$r.squared))
# Add Rsquard values to the data frame as plot label columns
DvsP.r <- plotlabels.fn(lfcTable.percent$DvsP.rsq,
lfcTable.percent$data,
nrow(lfcTable.percent))
DvsT.r <- plotlabels.fn(lfcTable.percent$DvsT.rsq,
lfcTable.percent$data,
nrow(lfcTable.percent))
PvsT.r <- plotlabels.fn(lfcTable.percent$PvsT.rsq,
lfcTable.percent$data,
nrow(lfcTable.percent))
lfcTable.percent <- lfcTable.percent %>%
unnest(data)
lfcTable.percent$DvsP.Rsquared <- DvsP.r
lfcTable.percent$DvsT.Rsquared <- DvsT.r
lfcTable.percent$PvsT.Rsquared <- PvsT.r
# Explore the output data frame
head(lfcTable.percent)
## # A tibble: 6 x 14
## # Groups: Metric [1]
## Metric GENEID Difference num.trans Percent DvsP.mod DvsP.rsq DvsT.mod DvsT.rsq
## <chr> <chr> <dbl> <int> <dbl> <list> <dbl> <list> <dbl>
## 1 baseM… ENSMU… -114. 3 -1.77 <lm> 0.155 <lm> 1.75e-5
## 2 baseM… ENSMU… -34.5 14 -2.92 <lm> 0.155 <lm> 1.75e-5
## 3 baseM… ENSMU… -1.69 1 -2.53 <lm> 0.155 <lm> 1.75e-5
## 4 baseM… ENSMU… 13.6 7 1.21 <lm> 0.155 <lm> 1.75e-5
## 5 baseM… ENSMU… 1091. 8 1.65 <lm> 0.155 <lm> 1.75e-5
## 6 baseM… ENSMU… -151. 3 -1.68 <lm> 0.155 <lm> 1.75e-5
## # … with 5 more variables: PvsT.mod <list>, PvsT.rsq <dbl>,
## # DvsP.Rsquared <chr>, DvsT.Rsquared <chr>, PvsT.Rsquared <chr>
# Set a function creating scatter plot
scatter2.fn <- function(label.col, xtitle, ytitle, title, ylog=F, xlog=F, label=F) {
p <- ggplot(lfcTable.percent) +
theme_bw() +
scale_color_gradient(low="blue", high="red") +
facet_wrap(~Metric, scales = "free") +
theme(strip.text.x=element_text(size=10)) +
ggtitle(paste(title, "in Salmon vs STAR")) +
xlab(xtitle) +
ylab(ytitle)
if (ylog) {
p <- p + scale_y_log10() + ylab(paste0("Log (", ytitle, ")"))
}
if (xlog) {
p <- p + scale_x_log10() + xlab(paste0("Log (", xtitle, ")"))
}
if (label) {
p <- p + geom_text(data=lfcTable.percent,
size=5,
mapping=aes(label=label.col, x=Inf, y=Inf),
vjust=2, hjust=1.1, color="black") +
ggtitle(paste(title, "in Salmon vs STAR", "(with R-Squared)"))
}
return(p)
}
# Create and save the plots
# P1 & 2: D vs P with or without log transformation of x values
# P3 & 4: P vs T with or without log transformation of y values
# P5 & 6: D vs T with or without log transformation of y values
p1 <- scatter2.fn(lfcTable.percent$DvsP.Rsquared,
"Salmon - STAR",
"100 x (Salmon - STAR) / (Salmon + STAR)",
"Difference vs Percent Difference", label=T) +
geom_point(data=lfcTable.percent, alpha=0.5,
aes(x=abs(Difference), y=abs(Percent), color=log(num.trans)))
p2 <- scatter2.fn(lfcTable.percent$DvsP.Rsquared,
"Salmon - STAR",
"100 x (Salmon - STAR) / (Salmon + STAR)",
"Difference vs Percent Difference",
xlog=T) +
geom_point(data=lfcTable.percent, alpha=0.5,
aes(x=abs(Difference), y=abs(Percent), color=log(num.trans)))
p3 <- scatter2.fn(lfcTable.percent$PvsT.Rsquared,
"Number of Alternative Transcripts",
"100 x (Salmon - STAR) / (Salmon + STAR)",
"Percent Difference vs Number of Alternative Transcripts",
label=T) +
geom_point(data=lfcTable.percent, alpha=0.5,
aes(x=num.trans, y=abs(Percent)))
p4 <- scatter2.fn(lfcTable.percent$PvsT.Rsquared,
"Number of Alternative Transcripts",
"100 x (Salmon - STAR) / (Salmon + STAR)",
"Percent Difference vs Number of Alternative Transcripts",
ylog=T) +
geom_point(data=lfcTable.percent, alpha=0.5,
aes(x=num.trans, y=abs(Percent)))
p5 <- scatter2.fn(lfcTable.percent$DvsT.Rsquared,
"Number of Alternative Transcripts",
"Salmon - STAR",
"Difference vs Number of Alternative Transcripts",
label=T) +
geom_point(data=lfcTable.percent, alpha=0.5,
aes(x=num.trans, y=abs(Difference)))
p6 <- scatter2.fn(lfcTable.percent$DvsT.Rsquared,
"Number of Alternative Transcripts",
"Salmon - STAR",
"Difference vs Number of Alternative Transcripts",
ylog=T) +
geom_point(data=lfcTable.percent, alpha=0.5,
aes(x=num.trans, y=abs(Difference)))
# Print the plots
grid.arrange(p1, p2, ncol=1)

grid.arrange(p3, p4, ncol=1)

grid.arrange(p5, p6, ncol=1)

Comparison of Ranking between Salmon and STAR in baseMean/log2FoldChange/FDR
# Set a function rearranging a data frame
rank.fn <- function(df, var, desc=F) {
if (desc) {
df <- dplyr::arrange(df, desc(var))
} else {
df <- dplyr::arrange(df, var)
}
return(df)
}
# Set a function creating a rank plot
rank.plot.fn <- function(df, metric, colog=F) {
if (colog) {
p <- ggplot(df, aes(x=Rank.x,
y=Rank.y,
color=log(MeanValue),
label=Rsquared))
} else {
p <- ggplot(df, aes(x=Rank.x,
y=Rank.y,
color=MeanValue,
label=Rsquared))
}
p <- p +
geom_point(alpha=0.5) +
theme_bw() +
scale_color_gradient(low="blue", high="red") +
xlab("Salmon") +
ylab("STAR") +
ggtitle(paste("Rank Comparison in", metric, "\n(with R-Squared)")) + geom_text(size=5,
mapping=aes(x=Inf, y=Inf), vjust=2, hjust=1.0, color="black") +
geom_abline(slope=1, size=0.5, linetype="dashed", color="black")
return(p)
}
# Clean the data frame
lfcTable.rank <- lfcTable %>%
# Splice by columns of interest
dplyr::select(GENEID,
starts_with(c("baseMean_", "log2FoldChange_", "padj_")),
-ends_with("HISAT2")) %>%
# Reform the table
gather(Metric, Value, -GENEID) %>%
# Nest by Metric
group_by(Metric) %>%
nest() %>%
# Add columns storing ascending or descending order by variable of interest
mutate(Ascending=map(data, ~ rank.fn(.x, .x$Value)),
Descending=map(data, ~ rank.fn(.x, .x$Value, T)),
Final=map2(Ascending,
Descending,
~ ifelse(Metric %in% c("padj_Salmon", "padj_STAR"),
Ascending,
Descending)),
Final=map(Final, ~ .x[[1]]),
# Add a column storing rank
Final=map(Final, ~ mutate(.x, Rank=1:nrow(.x)))) %>%
# Unnest
unnest(Final) %>%
# Reform the table
separate(Metric, c("Metric", "Aligner")) %>%
dplyr::select(-data, -Ascending, -Descending) %>%
# Renest by metric
group_by(Metric) %>%
nest() %>%
# Reclean the data frame
mutate(Split=map(data, ~ split(.x, .x$Aligner)),
Joined=map(Split, ~ inner_join(.x[[1]],
.x[[2]],
by=c("GENEID")))) %>%
dplyr::select(-data, -Split) %>%
# Unnest
unnest(Joined) %>%
# Calculate and save MeanRank and MeanValue
mutate(MeanRank=(Rank.x + Rank.y)/2,
MeanValue=abs((Value.x + Value.y)/2)) %>%
# Nest by Metric
nest(-Metric) %>%
# Add a column storing R^2
mutate(Rsquared=map_dbl(data, ~ cor(.x$Rank.x, .x$Rank.y)))
# Explore the data frame
head(lfcTable.rank)
## # A tibble: 3 x 3
## # Groups: Metric [3]
## Metric data Rsquared
## <chr> <list> <dbl>
## 1 baseMean <tibble [1,199 × 9]> 0.993
## 2 log2FoldChange <tibble [1,199 × 9]> 0.999
## 3 padj <tibble [1,199 × 9]> 0.926
head(lfcTable.rank$data)
## [[1]]
## # A tibble: 1,199 x 9
## Aligner.x GENEID Value.x Rank.x Aligner.y Value.y Rank.y MeanRank MeanValue
## <chr> <chr> <dbl> <int> <chr> <dbl> <int> <dbl> <dbl>
## 1 Salmon ENSMUSG… 72706. 1 STAR 76196. 1 1 74451.
## 2 Salmon ENSMUSG… 53884. 2 STAR 56130. 2 2 55007.
## 3 Salmon ENSMUSG… 53308. 3 STAR 56022. 3 3 54665.
## 4 Salmon ENSMUSG… 46759. 4 STAR 50141. 5 4.5 48450.
## 5 Salmon ENSMUSG… 43438. 5 STAR 44629. 6 5.5 44033.
## 6 Salmon ENSMUSG… 39293. 6 STAR 40984. 8 7 40138.
## 7 Salmon ENSMUSG… 39062. 7 STAR 41583. 7 7 40323.
## 8 Salmon ENSMUSG… 38475. 8 STAR 40024. 9 8.5 39249.
## 9 Salmon ENSMUSG… 34056. 9 STAR 35897. 10 9.5 34976.
## 10 Salmon ENSMUSG… 33672. 10 STAR 32581. 12 11 33126.
## # … with 1,189 more rows
##
## [[2]]
## # A tibble: 1,199 x 9
## Aligner.x GENEID Value.x Rank.x Aligner.y Value.y Rank.y MeanRank MeanValue
## <chr> <chr> <dbl> <int> <chr> <dbl> <int> <dbl> <dbl>
## 1 Salmon ENSMUSG… 3.82 1 STAR 4.16 1 1 3.99
## 2 Salmon ENSMUSG… 3.52 2 STAR 3.41 2 2 3.47
## 3 Salmon ENSMUSG… 3.00 3 STAR 2.81 4 3.5 2.90
## 4 Salmon ENSMUSG… 2.97 4 STAR 2.63 5 4.5 2.80
## 5 Salmon ENSMUSG… 2.80 5 STAR 2.91 3 4 2.85
## 6 Salmon ENSMUSG… 2.64 6 STAR 2.47 8 7 2.56
## 7 Salmon ENSMUSG… 2.59 7 STAR 2.61 6 6.5 2.60
## 8 Salmon ENSMUSG… 2.51 8 STAR 2.55 7 7.5 2.53
## 9 Salmon ENSMUSG… 2.50 9 STAR 2.31 12 10.5 2.41
## 10 Salmon ENSMUSG… 2.39 10 STAR 2.42 9 9.5 2.41
## # … with 1,189 more rows
##
## [[3]]
## # A tibble: 1,199 x 9
## Aligner.x GENEID Value.x Rank.x Aligner.y Value.y Rank.y MeanRank MeanValue
## <chr> <chr> <dbl> <int> <chr> <dbl> <int> <dbl> <dbl>
## 1 Salmon ENSMU… 3.34e-87 1 STAR 1.32e-93 1 1 1.67e-87
## 2 Salmon ENSMU… 1.89e-32 2 STAR 4.06e-39 2 2 9.45e-33
## 3 Salmon ENSMU… 5.65e-31 3 STAR 2.43e-33 3 3 2.84e-31
## 4 Salmon ENSMU… 1.25e-30 4 STAR 5.02e-31 4 4 8.74e-31
## 5 Salmon ENSMU… 3.48e-26 5 STAR 1.98e-26 5 5 2.73e-26
## 6 Salmon ENSMU… 8.12e-24 6 STAR 6.40e-25 6 6 4.38e-24
## 7 Salmon ENSMU… 3.55e-21 7 STAR 2.02e-18 13 10 1.01e-18
## 8 Salmon ENSMU… 5.19e-21 8 STAR 1.96e-24 7 7.5 2.60e-21
## 9 Salmon ENSMU… 3.83e-20 9 STAR 1.49e-22 8 8.5 1.92e-20
## 10 Salmon ENSMU… 1.57e-17 10 STAR 7.29e-21 9 9.5 7.85e-18
## # … with 1,189 more rows
# Create a vector storing R^2 values as an input of ggplot2()
Rsquared <- plotlabels.fn(lfcTable.rank$Rsquared,
lfcTable.rank$data,
nrow(lfcTable.rank))
# Unnest and add a column storing ggplot2's input R^2
lfcTable.rank <- lfcTable.rank %>%
unnest(data)
lfcTable.rank$Rsquared <- Rsquared
# Check out the new column added
head(lfcTable.rank)
## # A tibble: 6 x 11
## # Groups: Metric [1]
## Metric Aligner.x GENEID Value.x Rank.x Aligner.y Value.y Rank.y MeanRank
## <chr> <chr> <chr> <dbl> <int> <chr> <dbl> <int> <dbl>
## 1 baseMe… Salmon ENSMUSG000… 72706. 1 STAR 76196. 1 1
## 2 baseMe… Salmon ENSMUSG000… 53884. 2 STAR 56130. 2 2
## 3 baseMe… Salmon ENSMUSG000… 53308. 3 STAR 56022. 3 3
## 4 baseMe… Salmon ENSMUSG000… 46759. 4 STAR 50141. 5 4.5
## 5 baseMe… Salmon ENSMUSG000… 43438. 5 STAR 44629. 6 5.5
## 6 baseMe… Salmon ENSMUSG000… 39293. 6 STAR 40984. 8 7
## # … with 2 more variables: MeanValue <dbl>, Rsquared <chr>
# Clean the data frame
lfcTable.rank <- lfcTable.rank %>%
# Nest by metric
group_by(Metric) %>%
nest() %>%
# Add columns storing scatter plots
mutate(RankPlot=map(data, ~ rank.plot.fn(.x, Metric)),
RankPlot.log=map(data, ~ rank.plot.fn(.x, Metric, T)))
# Set a function simplifying grid.arrange()
gr.ar.fn3 <- function(pl1, pl2) {
grid.arrange(pl1, pl2, nrow=1)
}
# Explore the plots
gr.ar.fn3(lfcTable.rank$RankPlot[[1]],
lfcTable.rank$RankPlot.log[[1]])

gr.ar.fn3(lfcTable.rank$RankPlot[[2]],
lfcTable.rank$RankPlot.log[[2]])

gr.ar.fn3(lfcTable.rank$RankPlot[[3]],
lfcTable.rank$RankPlot.log[[3]])

# Complete the optimal presentation for the plots
gr.ar.fn2(lfcTable.rank$RankPlot.log[[1]],
lfcTable.rank$RankPlot.log[[2]],
lfcTable.rank$RankPlot[[3]])

Saving difference tables
for (x in my.param) {
# Subset by metric
df <- subset(lfcTable.percent, Metric == x)[, 1:5]
# Determine whether ascending or descending order is needed
if (x == "padj") {
# Descending for padj
ordering <- F
} else {
# Ascending otherwise
ordering <- T
}
# Rearrange the data frame using the function rank.fn() set previously
df <- rank.fn(df, abs(df$Percent), desc=ordering)
# Print the output data frame
print(head(df))
# Save as a csv file
write.csv(df,
paste0(x, "_difference_SA_ST.csv"))
}
## # A tibble: 6 x 5
## # Groups: Metric [1]
## Metric GENEID Difference num.trans Percent
## <chr> <chr> <dbl> <int> <dbl>
## 1 baseMean ENSMUSG00000073489 -539. 2 -93.9
## 2 baseMean ENSMUSG00000073647 -2206. 1 -89.3
## 3 baseMean ENSMUSG00000079110 372. 14 76.6
## 4 baseMean ENSMUSG00000096401 -2260. 1 -67.1
## 5 baseMean ENSMUSG00000095661 -209. 2 -57.9
## 6 baseMean ENSMUSG00000102918 401. 3 57.0
## # A tibble: 6 x 5
## # Groups: Metric [1]
## Metric GENEID Difference num.trans Percent
## <chr> <chr> <dbl> <int> <dbl>
## 1 log2FoldChange ENSMUSG00000073647 -0.520 1 -48.0
## 2 log2FoldChange ENSMUSG00000090691 -1.31 1 -47.6
## 3 log2FoldChange ENSMUSG00000073489 -0.708 2 -32.0
## 4 log2FoldChange ENSMUSG00000095661 -0.333 2 -27.6
## 5 log2FoldChange ENSMUSG00000039431 -0.373 6 -26.0
## 6 log2FoldChange ENSMUSG00000096401 -0.124 1 -20.9
## # A tibble: 6 x 5
## # Groups: Metric [1]
## Metric GENEID Difference num.trans Percent
## <chr> <chr> <dbl> <int> <dbl>
## 1 padj ENSMUSG00000024101 -0.0000383 8 -0.0961
## 2 padj ENSMUSG00000060961 -0.0000740 10 -0.135
## 3 padj ENSMUSG00000055805 0.0000119 7 0.216
## 4 padj ENSMUSG00000027508 0.000201 5 0.219
## 5 padj ENSMUSG00000026249 -0.000000321 6 -0.244
## 6 padj ENSMUSG00000042351 0.000195 5 0.255
Saving gene rankings
lfcTable.csv <- lfcTable %>%
dplyr::select(GENEID, starts_with(my.param[-1])) %>%
gather(Category, Value, -GENEID) %>%
separate(Category, c("Metric", "Aligner")) %>%
pivot_wider(names_from=Metric, values_from=Value)
write.csv(lfcTable.csv, "lfcTable.csv")
Session Info
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
##
## Matrix products: default
## BLAS: /home/mira/miniconda3/envs/r/lib/libblas.so.3.8.0
## LAPACK: /home/mira/miniconda3/envs/r/lib/liblapack.so.3.8.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] gridExtra_2.3 ensembldb_2.14.0
## [3] AnnotationFilter_1.14.0 GenomicFeatures_1.42.1
## [5] AnnotationDbi_1.52.0 UpSetR_1.4.0
## [7] DESeq2_1.30.1 SummarizedExperiment_1.20.0
## [9] Biobase_2.50.0 MatrixGenerics_1.2.0
## [11] matrixStats_0.58.0 GenomicRanges_1.42.0
## [13] GenomeInfoDb_1.26.0 IRanges_2.24.1
## [15] S4Vectors_0.28.1 Rsubread_2.4.0
## [17] tximport_1.18.0 AnnotationHub_2.22.0
## [19] BiocFileCache_1.14.0 dbplyr_2.1.0
## [21] BiocGenerics_0.36.0 pheatmap_1.0.12
## [23] rmarkdown_2.7 forcats_0.5.1
## [25] stringr_1.4.0 dplyr_1.0.5
## [27] purrr_0.3.4 readr_1.4.0
## [29] tidyr_1.1.3 tibble_3.1.0
## [31] ggplot2_3.3.3 tidyverse_1.3.0
## [33] data.table_1.14.0
##
## loaded via a namespace (and not attached):
## [1] colorspace_2.0-0 ellipsis_0.3.1
## [3] XVector_0.30.0 fs_1.5.0
## [5] rstudioapi_0.13 farver_2.0.3
## [7] bit64_4.0.5 interactiveDisplayBase_1.28.0
## [9] fansi_0.4.2 lubridate_1.7.10
## [11] xml2_1.3.2 splines_4.0.3
## [13] cachem_1.0.4 geneplotter_1.68.0
## [15] knitr_1.31 jsonlite_1.7.2
## [17] Rsamtools_2.6.0 broom_0.7.5
## [19] annotate_1.68.0 shiny_1.6.0
## [21] BiocManager_1.30.10 compiler_4.0.3
## [23] httr_1.4.2 backports_1.2.1
## [25] lazyeval_0.2.2 assertthat_0.2.1
## [27] Matrix_1.3-2 fastmap_1.1.0
## [29] cli_2.3.1 later_1.1.0.1
## [31] prettyunits_1.1.1 htmltools_0.5.1.1
## [33] tools_4.0.3 gtable_0.3.0
## [35] glue_1.4.2 GenomeInfoDbData_1.2.4
## [37] rappdirs_0.3.3 Rcpp_1.0.6
## [39] cellranger_1.1.0 jquerylib_0.1.3
## [41] Biostrings_2.58.0 vctrs_0.3.6
## [43] rtracklayer_1.50.0 xfun_0.21
## [45] ps_1.5.0 rvest_0.3.6
## [47] mime_0.10 lifecycle_1.0.0
## [49] XML_3.99-0.5 zlibbioc_1.36.0
## [51] scales_1.1.1 ProtGenerics_1.22.0
## [53] hms_1.0.0 promises_1.2.0.1
## [55] RColorBrewer_1.1-2 yaml_2.2.1
## [57] curl_4.3 memoise_2.0.0
## [59] sass_0.3.1 biomaRt_2.46.3
## [61] stringi_1.5.3 RSQLite_2.2.3
## [63] highr_0.8 BiocVersion_3.12.0
## [65] genefilter_1.72.1 BiocParallel_1.24.0
## [67] rlang_0.4.10 pkgconfig_2.0.3
## [69] bitops_1.0-6 evaluate_0.14
## [71] lattice_0.20-41 labeling_0.4.2
## [73] GenomicAlignments_1.26.0 bit_4.0.4
## [75] tidyselect_1.1.0 plyr_1.8.6
## [77] magrittr_2.0.1 R6_2.5.0
## [79] generics_0.1.0 DelayedArray_0.16.0
## [81] DBI_1.1.1 pillar_1.5.0
## [83] haven_2.3.1 withr_2.4.1
## [85] survival_3.2-7 RCurl_1.98-1.2
## [87] modelr_0.1.8 crayon_1.4.1
## [89] utf8_1.1.4 progress_1.2.2
## [91] locfit_1.5-9.4 grid_4.0.3
## [93] readxl_1.3.1 blob_1.2.1
## [95] reprex_1.0.0 digest_0.6.27
## [97] xtable_1.8-4 httpuv_1.5.5
## [99] openssl_1.4.3 munsell_0.5.0
## [101] bslib_0.2.4 askpass_1.1